From f1359b11f2cdeac6754b7d09deb7c4a24ff69e73 Mon Sep 17 00:00:00 2001
From: Darren Edge
Date: Wed, 3 Apr 2024 12:45:33 +0100
Subject: [PATCH] Demo edits

---
 .streamlit/config.toml                        |  3 ++-
 README.md                                     |  2 +-
 app/util/Embedder.py                          |  9 +++++--
 app/util/ui_components.py                     | 18 ++++++++-----
 app/workflows/attribute_patterns/functions.py | 12 ++++-----
 app/workflows/attribute_patterns/workflow.py  |  3 ++-
 app/workflows/question_answering/functions.py | 13 +++-------
 app/workflows/risk_networks/workflow.py       | 26 ++++++++-----------
 8 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/.streamlit/config.toml b/.streamlit/config.toml
index 2c639d32..b609c543 100644
--- a/.streamlit/config.toml
+++ b/.streamlit/config.toml
@@ -1,2 +1,3 @@
 [server]
-enableXsrfProtection = false
\ No newline at end of file
+enableXsrfProtection = false
+maxUploadSize = 1000
\ No newline at end of file
diff --git a/README.md b/README.md
index 1f081cee..d2aa7fb1 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ pip install -r requirements.txt
 streamlit run app/Home.py
 
 ## PDF export
-Install wkhtmltopdf to be able to generate the final story in PDF:
+Install wkhtmltopdf to be able to generate the final reports in PDF:
 
 Windows: [Download wkhtmltopdf installer](https://wkhtmltopdf.org/downloads.html)
 
diff --git a/app/util/Embedder.py b/app/util/Embedder.py
index c68aac35..d80b59c5 100644
--- a/app/util/Embedder.py
+++ b/app/util/Embedder.py
@@ -3,6 +3,7 @@
 import numpy as np
 from util.Database import Database
 import util.session_variables
+import streamlit as st
 
 gen_model = 'gpt-4-turbo-preview'
 embed_model = 'text-embedding-3-small'
@@ -39,14 +40,18 @@ def encode_all(self, texts):
                 final_embeddings[ix] = np.array(embeddings)
         print(f'Got {len(new_texts)} new texts')
         # split into batches of 2000
+        pb = st.progress(0, 'Embedding text batches...')
         for i in range(0, len(new_texts), 2000):
+            pb.progress((i+1) / len(new_texts), f'Embedding texts {i+1}-{min(i+2000, len(new_texts))} of {len(new_texts)}...')
             batch = new_texts[i:i+2000]
             batch_texts = [x[1] for x in batch]
             embeddings = [x.embedding for x in client.embeddings.create(input = batch_texts, model=self.model).data]
             for j, (ix, text) in enumerate(batch):
-                hsh = hash(text)
-                self.connection.insert_into_embeddings(hsh, embeddings[j])
+                print(j)
+                # hsh = hash(text)
+                # self.connection.insert_into_embeddings(hsh, embeddings[j])
                 final_embeddings[ix] = np.array(embeddings[j])
+        pb.empty()
         return np.array(final_embeddings)
 
     def encode(self, text):
diff --git a/app/util/ui_components.py b/app/util/ui_components.py
index 311f3f85..58b06643 100644
--- a/app/util/ui_components.py
+++ b/app/util/ui_components.py
@@ -139,15 +139,19 @@ def multi_csv_uploader(upload_label, uploaded_files_var, outputs_dir, key, max_r
     st.number_input('Maximum rows to process (0 = all)', min_value=0, step=1000, key=max_rows_var.key)
     if files != None:
         for file in files:
-            if file.name not in uploaded_files_var.value:
-                df = pd.read_csv(file, encoding='unicode-escape', encoding_errors='ignore')
-                df.to_csv(os.path.join(outputs_dir, file.name), index=False)
+            if file.name not in uploaded_files_var.value:
                 uploaded_files_var.value.append(file.name)
     selected_file = st.selectbox("Select a file to process", ['']+uploaded_files_var.value)
-
+    with st.expander('File options'):
+        encoding = st.selectbox('File encoding', options=['unicode-escape', 'utf-8', 'utf-8-sig'], key=f'{key}_encoding')
+        reload = st.button('Reload', key=f'{key}_reload')
+
     df = pd.DataFrame()
-    if selected_file not in [None, '']:
-        df = pd.read_csv(os.path.join(outputs_dir, selected_file), encoding='unicode-escape', nrows=max_rows_var.value, encoding_errors='ignore') if max_rows_var.value > 0 else pd.read_csv(os.path.join(outputs_dir, selected_file), encoding='unicode-escape', encoding_errors='ignore')
+    if selected_file not in [None, ''] or reload:
+        for file in files:
+            if file.name == selected_file:
+                df = pd.read_csv(file, encoding=encoding, nrows=max_rows_var.value, encoding_errors='ignore') if max_rows_var.value > 0 else pd.read_csv(file, encoding=encoding, encoding_errors='ignore')
+                break
     st.dataframe(df[:show_rows], hide_index=True, use_container_width=True, height=height)
     return selected_file, df
 
@@ -163,7 +167,7 @@ def prepare_input_df(workflow, input_df_var, processed_df_var, output_df_var, id
         st.session_state[f'{workflow}_binned_df'] = input_df_var.value.copy(deep=True)
 
     with st.expander('Set subject identifier', expanded=False):
-        identifier = st.radio('Subject identifier', options=['Row number', 'ID column'])
+        identifier = st.radio('Subject identifier', options=['Row number', 'ID column'], help='Select row number if each row of data represents a distinct individual, otherwise select ID column to link multiple rows to the same individual via their ID.')
         if identifier == 'ID column':
             options = ['']+list(input_df_var.value.columns.values)
             identifier_col = st.selectbox('Select subject identifier column', options=options)
diff --git a/app/workflows/attribute_patterns/functions.py b/app/workflows/attribute_patterns/functions.py
index 2e10b963..02274813 100644
--- a/app/workflows/attribute_patterns/functions.py
+++ b/app/workflows/attribute_patterns/functions.py
@@ -96,7 +96,6 @@ def prepare_graph(sv, mi=False):
 def generate_embedding(sv, df, time_to_graph):
     period_embeddings = {}
     node_list = sorted(df['Full Attribute'].unique().tolist())
-    print(node_list)
     sorted_att_types = sorted(df['Attribute Type'].unique())
     node_to_ix = {n : i for i, n in enumerate(node_list)}
     node_to_label = {n : sorted_att_types.index(n.split(config.type_val_sep)[0]) for n in node_list}
@@ -230,7 +229,6 @@
         for (a, b) in period_pairs:
             if len(pattern) > 0 and ((a in pattern and b in pattern) or (a not in pattern and b not in pattern)):
                 continue
-            # print(f'checking {a} and {b} in {pattern}')
             candidate = None
             if (a in pattern and b not in pattern):
                 candidate = [b]
@@ -253,7 +251,6 @@
                 if pcount > sv.attribute_min_pattern_count.value:
                     period_to_patterns[period].append((candidate_pattern, pcount))
                     pattern_to_periods[tuple(candidate_pattern)].add(period)
-                    # print(f'In {period} added {candidate_pattern} with count {pcount}')
     print('done combining pairs')
     # convert to df
     pattern_rows = []
@@ -263,11 +260,14 @@
             mean, sd, mx = rc.compute_period_mean_sd_max(pattern)
             score = (count - mean) / sd
             if score >= 0:
-                row = [period, ' & '.join(pattern), len(pattern), len(pattern_to_periods[tuple(pattern)]), count, round(mean, 0), round(score, 2)]
+                row = [period, ' & '.join(pattern), len(pattern), count, round(mean, 0), round(score, 2)]
                 pattern_rows.append(row)
-    columns = ['period', 'pattern', 'length', 'detections', 'count', 'mean', 'z_score']
+    columns = ['period', 'pattern', 'length', 'count', 'mean', 'z_score']
     pattern_df = pd.DataFrame(pattern_rows, columns=columns)
-    pattern_df['overall_score'] = pattern_df['z_score'] * pattern_df['length'] * np.log(pattern_df['count']) * pattern_df['detections']
+    # count number of periods per pattern
+    detections = pattern_df.groupby('pattern')['period'].count().reset_index().rename(columns={'period' : 'detections'})
+    pattern_df = pattern_df.merge(detections, on='pattern')
+    pattern_df['overall_score'] = pattern_df['z_score'] * pattern_df['length'] * pattern_df['detections'] * np.log(pattern_df['count'] + 1)
     # normalize overall score
     max_score = pattern_df['overall_score'].max()
     pattern_df['overall_score'] = pattern_df['overall_score'].apply(lambda x: round((x) / (max_score), 2))
diff --git a/app/workflows/attribute_patterns/workflow.py b/app/workflows/attribute_patterns/workflow.py
index e84c229b..ff27a561 100644
--- a/app/workflows/attribute_patterns/workflow.py
+++ b/app/workflows/attribute_patterns/workflow.py
@@ -56,6 +56,7 @@ def create():
             pdf = pd.melt(pdf, id_vars=['Subject ID', 'Period'], value_vars=att_cols, var_name='Attribute Type', value_name='Attribute Value')
             pdf = pdf[pdf['Attribute Value'] != '']
             pdf['Full Attribute'] = pdf.apply(lambda x: str(x['Attribute Type']) + config.type_val_sep + str(x['Attribute Value']), axis=1)
+            pdf = pdf[pdf['Period'] != '']
             sv.attribute_dynamic_df.value = pdf
         if ready and len(sv.attribute_dynamic_df.value) > 0:
             st.markdown(f'Graph model has **{len(sv.attribute_dynamic_df.value)}** links spanning **{len(sv.attribute_dynamic_df.value["Subject ID"].unique())}** cases, **{len(sv.attribute_dynamic_df.value["Full Attribute"].unique())}** attributes, and **{len(sv.attribute_dynamic_df.value["Period"].unique())}** periods.')
@@ -88,7 +89,7 @@ def create():
             period_count = len(sv.attribute_pattern_df.value["period"].unique())
             pattern_count = len(sv.attribute_pattern_df.value)
             unique_count = len(sv.attribute_pattern_df.value['pattern'].unique())
-            st.markdown(f'Over **{period_count}** periods, detected **{pattern_count}** attribute patterns (**{unique_count}** unique) from **{sv.attribute_converging_pairs.value}**/**{sv.attribute_all_pairs.value}** converging attribute pairs (**{round(sv.attribute_converging_pairs.value / sv.attribute_all_pairs.value * 100, 2) if sv.attribute_all_pairs.value > 0 else 0}%**).')
+            st.markdown(f'Over **{period_count}** periods, detected **{pattern_count}** attribute patterns (**{unique_count}** unique) from **{sv.attribute_converging_pairs.value}**/**{sv.attribute_all_pairs.value}** converging attribute pairs (**{round(sv.attribute_converging_pairs.value / sv.attribute_all_pairs.value * 100, 2) if sv.attribute_all_pairs.value > 0 else 0}%**). Patterns ranked by ```overall_score = normalize(length * ln(count + 1) * z_score * detections)```.')
             show_df = sv.attribute_pattern_df.value
             tdf = functions.create_time_series_df(sv.attribute_record_counter.value, sv.attribute_pattern_df.value)
             gb = GridOptionsBuilder.from_dataframe(show_df)
diff --git a/app/workflows/question_answering/functions.py b/app/workflows/question_answering/functions.py
index 07bd2e02..705bb0e8 100644
--- a/app/workflows/question_answering/functions.py
+++ b/app/workflows/question_answering/functions.py
@@ -21,9 +21,10 @@
 )
 
 def chunk_files(sv, files):
-    pb = st.progress(0, 'Chunking and embedding files...')
+    pb = st.progress(0, 'Chunking files...')
     file_chunks = []
-    for file_link in files:
+    for fx, file_link in enumerate(files):
+        pb.progress((fx+1) / len(files), f'Chunking file {fx+1} of {len(files)}...')
         file_names = [f.name for f in sv.answering_files.value.values()]
         doc_text = ''
         if file_link.name not in file_names:
@@ -32,13 +33,7 @@ def chunk_files(sv, files):
             file = classes.File(file_link.name, file_id)
             sv.answering_files.value[file_id] = file
             bytes = file_link.getvalue()
-            # path = f'{config.cache_dir}\\qa_mine\\raw_files'
-            # if not os.path.exists(path):
-            #     os.makedirs(path)
-            # path = os.path.join(path, file.name)
-            # with open(path, 'wb') as f:
-            #     f.write(bytes)
 
             pdf_reader = pdfplumber.open(io.BytesIO(bytes))
             for px in range(len(pdf_reader.pages)):
                 page_text = pdf_reader.pages[px].extract_text()
@@ -70,7 +65,7 @@ def update_question(sv, question_history, new_questions, placeholder, prefix):
     system_message = """\
 You are a helpful assistant augmenting a user question with any relevant keywords (e.g., entities, concepts, or knowledge) found in a list of input questions, each of which is prefixed by the question ID.
 
-Any relevant keywords should be inserted as a list, enclosed by parentheses, at the appropriate point in the question, with each keywords item referencing the supporting question IDs using "( [Q, Q...], [Q, Q...], ...)".
+Any keywords that directly help answer the question should be inserted as a list, enclosed by parentheses, at the appropriate point in the question, with each keywords item referencing the supporting question IDs using "( [Q, Q...], [Q, Q...], ...)".
 
 Do not insert any text indicating lack of relevant keywords, and do not remove any text (including question references) already present in the previous augmented question unless it is clearly irrelevant.
 
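The Embedder.encode_all change above batches texts in groups of 2000 and drives a Streamlit progress bar while the embeddings are fetched. Below is a minimal, self-contained sketch of that batching-plus-progress pattern; the stubbed embed_batch function and the helper name are assumptions standing in for the OpenAI embeddings call, not the toolkit's API.

```python
import numpy as np
import streamlit as st

BATCH_SIZE = 2000  # same batch size as encode_all above

def embed_batch(batch_texts):
    # Stand-in (assumption) for client.embeddings.create(...); returns one vector per text.
    return [np.zeros(3) for _ in batch_texts]

def encode_all_with_progress(texts):
    vectors = []
    pb = st.progress(0, 'Embedding text batches...')
    for i in range(0, len(texts), BATCH_SIZE):
        batch = texts[i:i + BATCH_SIZE]
        # Progress = fraction of texts submitted so far, with a readable label.
        pb.progress(min((i + len(batch)) / len(texts), 1.0),
                    f'Embedding texts {i + 1}-{i + len(batch)} of {len(texts)}...')
        vectors.extend(embed_batch(batch))
    pb.empty()  # remove the bar once all batches are embedded
    return np.array(vectors)
```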
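In multi_csv_uploader, the new 'File options' expander exposes a file-encoding choice and a Reload button, with the selected encoding feeding into pd.read_csv. A condensed sketch of that flow outside the app follows; the widget labels and the row-limit handling are illustrative, not the exact component code.

```python
import pandas as pd
import streamlit as st

files = st.file_uploader('Upload CSV files', type=['csv'], accept_multiple_files=True)
file_names = [f.name for f in files] if files else []
selected_file = st.selectbox('Select a file to process', [''] + file_names)

with st.expander('File options'):
    encoding = st.selectbox('File encoding', options=['unicode-escape', 'utf-8', 'utf-8-sig'])
    max_rows = st.number_input('Maximum rows to process (0 = all)', min_value=0, step=1000)
    reload = st.button('Reload')

df = pd.DataFrame()
if selected_file or reload:
    for file in files or []:
        if file.name == selected_file:
            # Pass the user-selected encoding through to the reader rather than
            # hard-coding one; nrows is only applied when a row limit is set.
            df = pd.read_csv(file, encoding=encoding, encoding_errors='ignore',
                             nrows=max_rows if max_rows > 0 else None)
            break
st.dataframe(df, use_container_width=True)
```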
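The scoring change in detect_patterns derives detections (the number of periods in which a pattern appears) from the pattern table itself and dampens raw counts with ln(count + 1) before normalizing. A small self-contained sketch of the same computation on toy data; the sample rows are invented for illustration only.

```python
import numpy as np
import pandas as pd

# Toy pattern table: one row per (period, pattern) detection.
pattern_df = pd.DataFrame([
    ['2021-H1', 'age=30-40 & city=A', 2, 12, 5.0, 2.0],
    ['2021-H2', 'age=30-40 & city=A', 2,  9, 5.0, 1.1],
    ['2021-H2', 'city=A & job=X & age=30-40', 3,  6, 2.0, 2.5],
], columns=['period', 'pattern', 'length', 'count', 'mean', 'z_score'])

# detections = number of periods in which each pattern was detected.
detections = (pattern_df.groupby('pattern')['period'].count()
              .reset_index().rename(columns={'period': 'detections'}))
pattern_df = pattern_df.merge(detections, on='pattern')

# overall_score = z_score * length * detections * ln(count + 1), normalized to [0, 1].
pattern_df['overall_score'] = (pattern_df['z_score'] * pattern_df['length']
                               * pattern_df['detections'] * np.log(pattern_df['count'] + 1))
pattern_df['overall_score'] = (pattern_df['overall_score']
                               / pattern_df['overall_score'].max()).round(2)

print(pattern_df[['period', 'pattern', 'detections', 'overall_score']])
```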
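chunk_files above now reports per-file progress while pdfplumber extracts page text. The sketch below strips that loop to its core; the downstream chunking of doc_text and the session-variable bookkeeping are omitted, and files is assumed to be the list returned by st.file_uploader.

```python
import io

import pdfplumber
import streamlit as st

def extract_pdf_texts(files):
    pb = st.progress(0, 'Chunking files...')
    texts = {}
    for fx, file_link in enumerate(files):
        pb.progress((fx + 1) / len(files), f'Chunking file {fx + 1} of {len(files)}...')
        # Read the uploaded file into memory and pull the text of every page.
        pdf_reader = pdfplumber.open(io.BytesIO(file_link.getvalue()))
        doc_text = ''
        for page in pdf_reader.pages:
            doc_text += (page.extract_text() or '') + '\n'
        texts[file_link.name] = doc_text
    pb.empty()
    return texts
```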
diff --git a/app/workflows/risk_networks/workflow.py b/app/workflows/risk_networks/workflow.py
index 8856fc5b..928dbdad 100644
--- a/app/workflows/risk_networks/workflow.py
+++ b/app/workflows/risk_networks/workflow.py
@@ -202,22 +202,18 @@ def create():
             st.multiselect('Select node types to fuzzy match', options=sorted([config.entity_label] + list(sv.network_node_types.value)), key=sv.network_indexed_node_types.key)
 
             if st.button('Index nodes', disabled=len(sv.network_indexed_node_types.value) == 0):
-                with st.spinner('Indexing nodes...'):
-
-                    text_types = list([(n, d['type']) for n, d in sv.network_overall_graph.value.nodes(data=True) if d['type'] in sv.network_indexed_node_types.value])
-                    texts = [t[0] for t in text_types]
-
-                    df = pd.DataFrame(text_types, columns=['text', 'type'])
-                    embeddings = embedder.encode_all(texts)
-                    vals = [(n, t, e) for (n, t), e in zip(text_types, embeddings)]
-                    edf = pd.DataFrame(vals, columns=['text', 'type', 'vector'])
+                text_types = list([(n, d['type']) for n, d in sv.network_overall_graph.value.nodes(data=True) if d['type'] in sv.network_indexed_node_types.value])
+                texts = [t[0] for t in text_types]
+
+                df = pd.DataFrame(text_types, columns=['text', 'type'])
+                embeddings = embedder.encode_all(texts)
+                vals = [(n, t, e) for (n, t), e in zip(text_types, embeddings)]
+                edf = pd.DataFrame(vals, columns=['text', 'type', 'vector'])
 
-                    edf = edf[edf['text'].isin(texts)]
-                    sv.network_embedded_texts.value = edf['text'].tolist()
-                    # edf['vector'] = edf['vector'].apply(lambda x : np.array([np.float32(y) for y in x[1:-1].split(' ') if y != '']))
-                    # embeddings = np.array(edf['vector'].tolist())
-                    nbrs = NearestNeighbors(n_neighbors=20, n_jobs=1, algorithm='auto', leaf_size=20, metric='cosine').fit(embeddings)
-                    sv.network_nearest_text_distances.value, sv.network_nearest_text_indices.value = nbrs.kneighbors(embeddings)
+                edf = edf[edf['text'].isin(texts)]
+                sv.network_embedded_texts.value = edf['text'].tolist()
+                nbrs = NearestNeighbors(n_neighbors=20, n_jobs=1, algorithm='auto', leaf_size=20, metric='cosine').fit(embeddings)
+                sv.network_nearest_text_distances.value, sv.network_nearest_text_indices.value = nbrs.kneighbors(embeddings)
             st.markdown(f'*Number of nodes indexed*: {len(sv.network_embedded_texts.value)}')
             st.markdown('##### Infer links')
             st.number_input('Similarity threshold', min_value=0.0, max_value=1.0, key=sv.network_similarity_threshold.key, step=0.01, value=sv.network_similarity_threshold.value)
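The indexing block above embeds every node label of the selected types and precomputes each label's nearest neighbours under cosine distance; the 'Similarity threshold' input that follows is then applied when inferring links between close matches. Below is a small sketch with toy labels and vectors in place of embedder.encode_all; the label format and the link-building loop are illustrative assumptions, not the workflow's actual inference code.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Toy node labels and embeddings standing in for embedder.encode_all(texts).
texts = ['ENTITY==Acme Ltd', 'ENTITY==ACME Limited', 'ENTITY==Globex Corp']
embeddings = np.array([[0.90, 0.10, 0.00],
                       [0.89, 0.11, 0.00],
                       [0.00, 0.20, 0.95]])

# Same indexing call as in the workflow: nearest neighbours under cosine distance
# (capped at the number of points available in this toy example).
nbrs = NearestNeighbors(n_neighbors=min(20, len(texts)), metric='cosine').fit(embeddings)
distances, indices = nbrs.kneighbors(embeddings)

# Illustrative link inference: keep pairs whose cosine similarity (1 - distance)
# clears a threshold, mirroring the 'Similarity threshold' input in the UI.
similarity_threshold = 0.95
links = set()
for i, (dists, idxs) in enumerate(zip(distances, indices)):
    for d, j in zip(dists, idxs):
        if i != j and 1 - d >= similarity_threshold:
            links.add(tuple(sorted((texts[i], texts[j]))))

print(links)  # the two Acme variants are linked; Globex is not
```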