
Commit f1359b1: Demo edits
Darren Edge committed Apr 3, 2024
1 parent fe864bf commit f1359b1
Showing 8 changed files with 44 additions and 42 deletions.
3 changes: 2 additions & 1 deletion .streamlit/config.toml
@@ -1,2 +1,3 @@
[server]
enableXsrfProtection = false
enableXsrfProtection = false
maxUploadSize = 1000
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@ pip install -r requirements.txt
streamlit run app/Home.py

## PDF export
Install wkhtmltopdf to be able to generate the final story in PDF:
Install wkhtmltopdf to be able to generate the final reports in PDF:

Windows: [Download wkhtmltopdf installer](https://wkhtmltopdf.org/downloads.html)

9 changes: 7 additions & 2 deletions app/util/Embedder.py
@@ -3,6 +3,7 @@
import numpy as np
from util.Database import Database
import util.session_variables
import streamlit as st

gen_model = 'gpt-4-turbo-preview'
embed_model = 'text-embedding-3-small'
@@ -39,14 +40,18 @@ def encode_all(self, texts):
final_embeddings[ix] = np.array(embeddings)
print(f'Got {len(new_texts)} new texts')
# split into batches of 2000
pb = st.progress(0, 'Embedding text batches...')
for i in range(0, len(new_texts), 2000):
pb.progress((i+1) / len(new_texts), f'Embedding text batch {i+1} of {len(new_texts)}...')
batch = new_texts[i:i+2000]
batch_texts = [x[1] for x in batch]
embeddings = [x.embedding for x in client.embeddings.create(input = batch_texts, model=self.model).data]
for j, (ix, text) in enumerate(batch):
hsh = hash(text)
self.connection.insert_into_embeddings(hsh, embeddings[j])
print(j)
# hsh = hash(text)
# self.connection.insert_into_embeddings(hsh, embeddings[j])
final_embeddings[ix] = np.array(embeddings[j])
pb.empty()
return np.array(final_embeddings)

def encode(self, text):
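
In isolation, the progress-reporting pattern added to encode_all above looks roughly like this (a self-contained toy sketch, with a sleep standing in for the embeddings API call; not the class method itself):

```python
import time
import streamlit as st

items = list(range(10_000))   # stand-ins for texts to embed
batch_size = 2000

pb = st.progress(0, 'Embedding text batches...')
for i in range(0, len(items), batch_size):
    batch_num = i // batch_size + 1
    total_batches = -(-len(items) // batch_size)  # ceiling division
    pb.progress(min((i + batch_size) / len(items), 1.0),
                f'Embedding batch {batch_num} of {total_batches}...')
    time.sleep(0.1)  # stand-in for client.embeddings.create(...)
pb.empty()
```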
18 changes: 11 additions & 7 deletions app/util/ui_components.py
@@ -139,15 +139,19 @@ def multi_csv_uploader(upload_label, uploaded_files_var, outputs_dir, key, max_r
st.number_input('Maximum rows to process (0 = all)', min_value=0, step=1000, key=max_rows_var.key)
if files != None:
for file in files:
if file.name not in uploaded_files_var.value:
df = pd.read_csv(file, encoding='unicode-escape', encoding_errors='ignore')
df.to_csv(os.path.join(outputs_dir, file.name), index=False)
if file.name not in uploaded_files_var.value:
uploaded_files_var.value.append(file.name)
selected_file = st.selectbox("Select a file to process", ['']+uploaded_files_var.value)

with st.expander('File options'):
encoding = st.selectbox('File encoding', options=['unicode-escape', 'utf-8', 'utf-8-sig'], key=f'{key}_encoding')
reload = st.button('Reload', key=f'{key}_reload')

df = pd.DataFrame()
if selected_file not in [None, '']:
df = pd.read_csv(os.path.join(outputs_dir, selected_file), encoding='unicode-escape', nrows=max_rows_var.value, encoding_errors='ignore') if max_rows_var.value > 0 else pd.read_csv(os.path.join(outputs_dir, selected_file), encoding='unicode-escape', encoding_errors='ignore')
if selected_file not in [None, ''] or reload:
for file in files:
if file.name == selected_file:
df = pd.read_csv(file, encoding='unicode-escape', nrows=max_rows_var.value, encoding_errors='ignore') if max_rows_var.value > 0 else pd.read_csv(file, encoding='unicode-escape', encoding_errors='ignore')
break
st.dataframe(df[:show_rows], hide_index=True, use_container_width=True, height=height)
return selected_file, df

@@ -163,7 +167,7 @@ def prepare_input_df(workflow, input_df_var, processed_df_var, output_df_var, id
st.session_state[f'{workflow}_binned_df'] = input_df_var.value.copy(deep=True)

with st.expander('Set subject identifier', expanded=False):
identifier = st.radio('Subject identifier', options=['Row number', 'ID column'])
identifier = st.radio('Subject identifier', options=['Row number', 'ID column'], help='Select row number if each row of data represents a distinct individual, otherwise select ID column to link multiple rows to the same individual via their ID.')
if identifier == 'ID column':
options = ['']+list(input_df_var.value.columns.values)
identifier_col = st.selectbox('Select subject identifier column', options=options)
12 changes: 6 additions & 6 deletions app/workflows/attribute_patterns/functions.py
@@ -96,7 +96,6 @@ def prepare_graph(sv, mi=False):
def generate_embedding(sv, df, time_to_graph):
period_embeddings = {}
node_list = sorted(df['Full Attribute'].unique().tolist())
print(node_list)
sorted_att_types = sorted(df['Attribute Type'].unique())
node_to_ix = {n : i for i, n in enumerate(node_list)}
node_to_label = {n : sorted_att_types.index(n.split(config.type_val_sep)[0]) for n in node_list}
@@ -230,7 +229,6 @@ def detect_patterns(sv):
for (a, b) in period_pairs:
if len(pattern) > 0 and ((a in pattern and b in pattern) or (a not in pattern and b not in pattern)):
continue
# print(f'checking {a} and {b} in {pattern}')
candidate = None
if (a in pattern and b not in pattern):
candidate = [b]
@@ -253,7 +251,6 @@ def detect_patterns(sv):
if pcount > sv.attribute_min_pattern_count.value:
period_to_patterns[period].append((candidate_pattern, pcount))
pattern_to_periods[tuple(candidate_pattern)].add(period)
# print(f'In {period} added {candidate_pattern} with count {pcount}')
print('done combining pairs')
# convert to df
pattern_rows = []
@@ -263,11 +260,14 @@ def detect_patterns(sv):
mean, sd, mx = rc.compute_period_mean_sd_max(pattern)
score = (count - mean) / sd
if score >= 0:
row = [period, ' & '.join(pattern), len(pattern), len(pattern_to_periods[tuple(pattern)]), count, round(mean, 0), round(score, 2)]
row = [period, ' & '.join(pattern), len(pattern), count, round(mean, 0), round(score, 2)]
pattern_rows.append(row)
columns = ['period', 'pattern', 'length', 'detections', 'count', 'mean', 'z_score']
columns = ['period', 'pattern', 'length', 'count', 'mean', 'z_score']
pattern_df = pd.DataFrame(pattern_rows, columns=columns)
pattern_df['overall_score'] = pattern_df['z_score'] * pattern_df['length'] * np.log(pattern_df['count']) * pattern_df['detections']
# count number of periods per pattern
detections = pattern_df.groupby('pattern')['period'].count().reset_index().rename(columns={'period' : 'detections'})
pattern_df = pattern_df.merge(detections, on='pattern')
pattern_df['overall_score'] = pattern_df['z_score'] * pattern_df['length'] * pattern_df['detections'] * np.log(pattern_df['count'] + 1)
# normalize overall score
max_score = pattern_df['overall_score'].max()
pattern_df['overall_score'] = pattern_df['overall_score'].apply(lambda x: round((x) / (max_score), 2))
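
To make the revised ranking concrete, here is a small self-contained sketch on toy data (not from the repository) of the same groupby/merge/score/normalize sequence:

```python
import numpy as np
import pandas as pd

# Toy pattern rows: one row per (period, pattern) detection.
pattern_df = pd.DataFrame({
    'period':  ['2023-Q1', '2023-Q2', '2023-Q2'],
    'pattern': ['A & B',   'A & B',   'C'],
    'length':  [2, 2, 1],
    'count':   [10, 14, 5],
    'mean':    [6.0, 6.0, 4.0],
    'z_score': [1.5, 2.0, 0.5],
})

# Recover detections by counting the periods in which each pattern appears.
detections = (pattern_df.groupby('pattern')['period'].count()
              .reset_index().rename(columns={'period': 'detections'}))
pattern_df = pattern_df.merge(detections, on='pattern')

# Combine into an overall score, then normalize by the maximum.
pattern_df['overall_score'] = (pattern_df['z_score'] * pattern_df['length']
                               * pattern_df['detections'] * np.log(pattern_df['count'] + 1))
max_score = pattern_df['overall_score'].max()
pattern_df['overall_score'] = (pattern_df['overall_score'] / max_score).round(2)
print(pattern_df[['period', 'pattern', 'detections', 'overall_score']])
```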
3 changes: 2 additions & 1 deletion app/workflows/attribute_patterns/workflow.py
@@ -56,6 +56,7 @@ def create():
pdf = pd.melt(pdf, id_vars=['Subject ID', 'Period'], value_vars=att_cols, var_name='Attribute Type', value_name='Attribute Value')
pdf = pdf[pdf['Attribute Value'] != '']
pdf['Full Attribute'] = pdf.apply(lambda x: str(x['Attribute Type']) + config.type_val_sep + str(x['Attribute Value']), axis=1)
pdf = pdf[pdf['Period'] != '']
sv.attribute_dynamic_df.value = pdf
if ready and len(sv.attribute_dynamic_df.value) > 0:
st.markdown(f'Graph model has **{len(sv.attribute_dynamic_df.value)}** links spanning **{len(sv.attribute_dynamic_df.value["Subject ID"].unique())}** cases, **{len(sv.attribute_dynamic_df.value["Full Attribute"].unique())}** attributes, and **{len(sv.attribute_dynamic_df.value["Period"].unique())}** periods.')
@@ -88,7 +89,7 @@ def create():
period_count = len(sv.attribute_pattern_df.value["period"].unique())
pattern_count = len(sv.attribute_pattern_df.value)
unique_count = len(sv.attribute_pattern_df.value['pattern'].unique())
st.markdown(f'Over **{period_count}** periods, detected **{pattern_count}** attribute patterns (**{unique_count}** unique) from **{sv.attribute_converging_pairs.value}**/**{sv.attribute_all_pairs.value}** converging attribute pairs (**{round(sv.attribute_converging_pairs.value / sv.attribute_all_pairs.value * 100, 2) if sv.attribute_all_pairs.value > 0 else 0}%**).')
st.markdown(f'Over **{period_count}** periods, detected **{pattern_count}** attribute patterns (**{unique_count}** unique) from **{sv.attribute_converging_pairs.value}**/**{sv.attribute_all_pairs.value}** converging attribute pairs (**{round(sv.attribute_converging_pairs.value / sv.attribute_all_pairs.value * 100, 2) if sv.attribute_all_pairs.value > 0 else 0}%**). Patterns ranked by ```overall_score = normalize(length * ln(count) * z_score * detections)```.')
show_df = sv.attribute_pattern_df.value
tdf = functions.create_time_series_df(sv.attribute_record_counter.value, sv.attribute_pattern_df.value)
gb = GridOptionsBuilder.from_dataframe(show_df)
13 changes: 4 additions & 9 deletions app/workflows/question_answering/functions.py
@@ -21,9 +21,10 @@
)

def chunk_files(sv, files):
pb = st.progress(0, 'Chunking and embedding files...')
pb = st.progress(0, 'Chunking files...')
file_chunks = []
for file_link in files:
for fx, file_link in enumerate(files):
pb.progress((fx+1) / len(files), f'Chunking file {fx+1} of {len(files)}...')
file_names = [f.name for f in sv.answering_files.value.values()]
doc_text = ''
if file_link.name not in file_names:
@@ -32,13 +33,7 @@ def chunk_files(sv, files):
file = classes.File(file_link.name, file_id)
sv.answering_files.value[file_id] = file
bytes = file_link.getvalue()
# path = f'{config.cache_dir}\\qa_mine\\raw_files'
# if not os.path.exists(path):
# os.makedirs(path)
# path = os.path.join(path, file.name)

# with open(path, 'wb') as f:
# f.write(bytes)
pdf_reader = pdfplumber.open(io.BytesIO(bytes))
for px in range(len(pdf_reader.pages)):
page_text = pdf_reader.pages[px].extract_text()
@@ -70,7 +65,7 @@ def update_question(sv, question_history, new_questions, placeholder, prefix):
system_message = """\
You are a helpful assistant augmenting a user question with any relevant keywords (e.g., entities, concepts, or knowledge) found in a list of input questions, each of which is prefixed by the question ID.
Any relevant keywords should be inserted as a list, enclosed by parentheses, at the appropriate point in the question, with each keywords item referencing the supporting question IDs using "(<keywords 1> [Q<ID>, Q<ID>...], <keywords 2> [Q<ID>, Q<ID>...], ...)".
Any keywords that directly help answer the question should be inserted as a list, enclosed by parentheses, at the appropriate point in the question, with each keywords item referencing the supporting question IDs using "(<keywords 1> [Q<ID>, Q<ID>...], <keywords 2> [Q<ID>, Q<ID>...], ...)".
Do not insert any text indicating lack of relevant keywords, and do not remove any text (including question references) already present in the previous augmented question unless it is clearly irrelevant.
26 changes: 11 additions & 15 deletions app/workflows/risk_networks/workflow.py
@@ -202,22 +202,18 @@ def create():
st.multiselect('Select node types to fuzzy match', options=sorted([config.entity_label] + list(sv.network_node_types.value)), key=sv.network_indexed_node_types.key)
if st.button('Index nodes', disabled=len(sv.network_indexed_node_types.value) == 0):

with st.spinner('Indexing nodes...'):

text_types = list([(n, d['type']) for n, d in sv.network_overall_graph.value.nodes(data=True) if d['type'] in sv.network_indexed_node_types.value])
texts = [t[0] for t in text_types]

df = pd.DataFrame(text_types, columns=['text', 'type'])
embeddings = embedder.encode_all(texts)
vals = [(n, t, e) for (n, t), e in zip(text_types, embeddings)]
edf = pd.DataFrame(vals, columns=['text', 'type', 'vector'])
text_types = list([(n, d['type']) for n, d in sv.network_overall_graph.value.nodes(data=True) if d['type'] in sv.network_indexed_node_types.value])
texts = [t[0] for t in text_types]

df = pd.DataFrame(text_types, columns=['text', 'type'])
embeddings = embedder.encode_all(texts)
vals = [(n, t, e) for (n, t), e in zip(text_types, embeddings)]
edf = pd.DataFrame(vals, columns=['text', 'type', 'vector'])

edf = edf[edf['text'].isin(texts)]
sv.network_embedded_texts.value = edf['text'].tolist()
# edf['vector'] = edf['vector'].apply(lambda x : np.array([np.float32(y) for y in x[1:-1].split(' ') if y != '']))
# embeddings = np.array(edf['vector'].tolist())
nbrs = NearestNeighbors(n_neighbors=20, n_jobs=1, algorithm='auto', leaf_size=20, metric='cosine').fit(embeddings)
sv.network_nearest_text_distances.value, sv.network_nearest_text_indices.value = nbrs.kneighbors(embeddings)
edf = edf[edf['text'].isin(texts)]
sv.network_embedded_texts.value = edf['text'].tolist()
nbrs = NearestNeighbors(n_neighbors=20, n_jobs=1, algorithm='auto', leaf_size=20, metric='cosine').fit(embeddings)
sv.network_nearest_text_distances.value, sv.network_nearest_text_indices.value = nbrs.kneighbors(embeddings)
st.markdown(f'*Number of nodes indexed*: {len(sv.network_embedded_texts.value)}')
st.markdown('##### Infer links')
st.number_input('Similarity threshold', min_value=0.0, max_value=1.0, key=sv.network_similarity_threshold.key, step=0.01, value=sv.network_similarity_threshold.value)
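
The link-inference code itself sits below the visible part of this hunk, so the following is only an illustrative sketch of how cosine nearest neighbors plus a similarity threshold can propose fuzzy-match links; the texts, vectors, and threshold handling are hypothetical, not the repository's implementation:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

texts = ['ACME Ltd', 'ACME Limited', 'Globex Corp']                 # stand-ins for node labels
embeddings = np.array([[0.90, 0.10], [0.88, 0.12], [0.10, 0.95]])   # stand-in embedding vectors

nbrs = NearestNeighbors(n_neighbors=2, metric='cosine').fit(embeddings)
distances, indices = nbrs.kneighbors(embeddings)

similarity_threshold = 0.95
for i, text in enumerate(texts):
    for dist, j in zip(distances[i], indices[i]):
        # With the cosine metric, distance = 1 - cosine similarity.
        if i != j and 1 - dist >= similarity_threshold:
            print(f'Candidate link: {text} <-> {texts[j]} (similarity {1 - dist:.3f})')
```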
