From 3b377f5824c771464f0b89089c539cbab3c4db79 Mon Sep 17 00:00:00 2001
From: Dayenne Souza
Date: Wed, 17 Apr 2024 15:46:58 -0300
Subject: [PATCH 1/6] add protected mode on graph in RN

---
 app/workflows/record_matching/workflow.py |  30 +++---
 app/workflows/risk_networks/functions.py  |   6 +-
 app/workflows/risk_networks/variables.py  |   3 +-
 app/workflows/risk_networks/workflow.py   | 117 ++++++++++++++++++----
 4 files changed, 118 insertions(+), 38 deletions(-)

diff --git a/app/workflows/record_matching/workflow.py b/app/workflows/record_matching/workflow.py
index c0cd8575..caf6cf47 100644
--- a/app/workflows/record_matching/workflow.py
+++ b/app/workflows/record_matching/workflow.py
@@ -296,24 +296,21 @@ def att_ui(i):
             sv.matching_matches_df.value = sv.matching_matches_df.value.sort(by=['Name similarity', 'Group ID'], descending=[False, False])
             # # keep all records linked to a group ID if any record linked to that ID has dataset GD or ILM
             # sv.matching_matches_df.value = sv.matching_matches_df.value.filter(pl.col('Group ID').is_in(sv.matching_matches_df.value.filter(pl.col('Dataset').is_in(['GD', 'ILM']))['Group ID'].unique()))
-            data = sv.matching_matches_df.value
-            unique_names = data['Entity name'].unique()
-            #verify if the names are already in this format: Entity_1, Entity_2, etc
-            pattern = f'^Entity_\d+$'
-            matches = unique_names.str.contains(pattern)
-            all_matches = matches.all()
-            if not all_matches and sv_home.protected_mode.value:
-                for i, name in enumerate(unique_names, start=1):
-                    data = data.with_columns(data['Entity name'].replace(name, 'Entity_{}'.format(i)))
-                sv.matching_matches_df.value = data
+
             st.rerun()
     if len(sv.matching_matches_df.value) > 0:
         st.markdown(f'Identified **{len(sv.matching_matches_df.value)}** record groups.')
     with c2:
+        data = sv.matching_matches_df.value
         st.markdown('##### Record groups')
         if len(sv.matching_matches_df.value) > 0:
-            st.dataframe(sv.matching_matches_df.value, height=700, use_container_width=True, hide_index=True)
-            st.download_button('Download record groups', data=sv.matching_matches_df.value.write_csv(), file_name='record_groups.csv', mime='text/csv')
+            if sv_home.protected_mode.value:
+                unique_names = sv.matching_matches_df.value['Entity name'].unique()
+                for i, name in enumerate(unique_names, start=1):
+                    data = data.with_columns(data['Entity name'].replace(name, 'Entity_{}'.format(i)))
+
+            st.dataframe(data, height=700, use_container_width=True, hide_index=True)
+            st.download_button('Download record groups', data=data.write_csv(), file_name='record_groups.csv', mime='text/csv')

     with evaluate_tab:
         b1, b2 = st.columns([2, 3])
@@ -336,7 +333,15 @@ def att_ui(i):
                     response = util.AI_API.generate_text_from_message_list(messages, placeholder, prefix=prefix)
                     if len(response.strip()) > 0:
                         prefix = prefix + response + '\n'
+
                 result = prefix.replace('```\n', '').strip()
+
+                if sv_home.protected_mode.value:
+                    unique_names = sv.matching_matches_df.value['Entity name'].unique()
+                    for i, name in enumerate(unique_names, start=1):
+                        # replace each unique entity name found in the result with its Entity_{i} alias
+                        result = result.replace(name, 'Entity_{}'.format(i))
+
                 sv.matching_evaluations.value = pl.read_csv(io.StringIO(result))

                 validation, messages_to_llm = util.ui_components.validate_ai_report(messages, sv.matching_evaluations.value)
@@ -347,6 +352,7 @@ def att_ui(i):
         if len(sv.matching_evaluations.value) == 0:
             gen_placeholder.warning('Press the Generate button to create an AI report for the current record matches.')
             placeholder.empty()
+
         if 
len(sv.matching_evaluations.value) > 0: st.dataframe(sv.matching_evaluations.value.to_pandas(), height=700, use_container_width=True, hide_index=True) jdf = sv.matching_matches_df.value.join(sv.matching_evaluations.value, on='Group ID', how='inner') diff --git a/app/workflows/risk_networks/functions.py b/app/workflows/risk_networks/functions.py index 844f58ce..6687a405 100644 --- a/app/workflows/risk_networks/functions.py +++ b/app/workflows/risk_networks/functions.py @@ -4,7 +4,7 @@ import colorsys import numpy as np from collections import defaultdict -from streamlit_agraph import Config, Edge, Node, agraph +from streamlit_agraph import Config, Edge, Node import workflows.risk_networks.config as config @@ -61,6 +61,7 @@ def get_type_color(node_type, is_flagged, attribute_types): comm = G.nodes[node]['network'] if 'network' in G.nodes[node] else '' label = '\n'.join(vals) + '\n(' + config.list_sep.join(atts) + ')' d_risk = G.nodes[node]['flags'] + nodes.append( Node( title=node + f'\nFlags: {d_risk}', @@ -82,8 +83,7 @@ def get_type_color(node_type, is_flagged, attribute_types): physics=True, hierarchical=False ) - return_value = agraph(nodes=nodes, edges=edges, config=g_config) # type: ignore - return return_value + return nodes, edges, g_config # type: ignore def merge_nodes(G, can_merge_fn): nodes = list(G.nodes()) # may change during iteration diff --git a/app/workflows/risk_networks/variables.py b/app/workflows/risk_networks/variables.py index f58ff9fd..1967c22d 100644 --- a/app/workflows/risk_networks/variables.py +++ b/app/workflows/risk_networks/variables.py @@ -57,4 +57,5 @@ def __init__(self, prefix): self.network_mean_flagged_flags = SessionVariable(0, prefix) self.network_risk_exposure = SessionVariable('', prefix) self.network_last_show_entities = SessionVariable(False, prefix) - self.network_last_show_groups = SessionVariable(False, prefix) \ No newline at end of file + self.network_last_show_groups = SessionVariable(False, prefix) + self.network_attributes_protected = SessionVariable([], prefix) \ No newline at end of file diff --git a/app/workflows/risk_networks/workflow.py b/app/workflows/risk_networks/workflow.py index 6ae59f2c..d139cf44 100644 --- a/app/workflows/risk_networks/workflow.py +++ b/app/workflows/risk_networks/workflow.py @@ -1,5 +1,6 @@ # Copyright (c) 2024 Microsoft Corporation. All rights reserved. 
import json
+import numpy as np
 import streamlit as st
 import pandas as pd
 import networkx as nx
@@ -7,6 +8,7 @@
 from collections import defaultdict
 from sklearn.neighbors import NearestNeighbors
 from util.session_variables import SessionVariables
+from streamlit_agraph import agraph
 import re
 import os

@@ -85,20 +87,17 @@ def create():
             if st.button("Add links to model", disabled=entity_col == '' or attribute_col == '' or len(value_cols) == 0 or link_type == ''):
                 with st.spinner('Adding links to model...'):
                     for value_col in value_cols:
-                        if attribute_col == 'Use column name':
-                            attribute_label = value_col
                         # remove punctuation but retain characters and digits in any language
                         # compress whitespace to single space
-                        if sv_home.protected_mode.value:
-                            for index, row in df.iterrows():
-                                df.at[index, entity_col] = f'Entity_{index}'
-                        else:
-                            df[entity_col] = df[entity_col].apply(lambda x : re.sub(r'[^\w\s&@\+]', '', str(x)).strip())
-                            df[entity_col] = df[entity_col].apply(lambda x : re.sub(r'\s+', ' ', str(x)).strip())
-
+                        df[entity_col] = df[entity_col].apply(lambda x : re.sub(r'[^\w\s&@\+]', '', str(x)).strip())
+                        df[entity_col] = df[entity_col].apply(lambda x : re.sub(r'\s+', ' ', str(x)).strip())
                         df[value_col] = df[value_col].apply(lambda x : re.sub(r'[^\w\s&@\+]', '', str(x)).strip())
                         df[value_col] = df[value_col].apply(lambda x : re.sub(r'\s+', ' ', str(x)).strip())
                         df[value_col] = df[value_col].apply(lambda x : re.sub(r'\s+', ' ', str(x)).strip())
+
+                        if attribute_col == 'Use column name':
+                            attribute_label = value_col
+
                         if link_type == 'Entity-Attribute':
                             if attribute_col in ['Use column name', 'Use custom name']:
                                 df['attribute_col'] = attribute_label
@@ -254,6 +253,14 @@ def create():
                 search = st.text_input('Search for attributes to remove', '')
                 if search != '':
                     adf = adf[adf['Attribute'].str.contains(search, case=False)]
+
+                if sv_home.protected_mode.value:
+                    unique_names = adf['Attribute'].unique()
+                    for i, name in enumerate(unique_names, start=1):
+                        # take the attribute type (the text before '==') and strip whitespace
+                        name_format = name.split('==')[0].strip()
+                        adf['Attribute'] = adf['Attribute'].apply(lambda x: f'{name_format}=={name_format}_{str(i)}' if name == x else x)
+                    sv.network_attributes_protected.value = [(unique_names),(adf['Attribute'].tolist())]
                 selected_rows = util.ui_components.dataframe_with_selections(adf, sv.network_additional_trimmed_attributes.value, 'Attribute', 'Remove', key='remove_attribute_table')
                 sv.network_additional_trimmed_attributes.value = selected_rows['Attribute'].tolist()
                 c1, c2, c3 = st.columns([1, 1, 1])
@@ -329,10 +336,18 @@ def create():
                 trimmed_atts = len(sv.network_trimmed_attributes.value)
                 st.markdown(f'*Networks identified: {comm_count} ({len(comm_sizes)} with multiple entities, maximum {max_comm_size})*')
                 st.markdown(f'*Attributes removed because of high degree*: {trimmed_atts}')
+
+                if sv_home.protected_mode.value:
+                    adf = pd.DataFrame(sv.network_attributes_list.value, columns=['Attribute'])
+                    data = sv.network_trimmed_attributes.value.copy()
+                    unique_names = adf['Attribute'].unique()
+                    for i, name in enumerate(unique_names, start=1):
+                        name_format = name.split('==')[0].strip()
+                        data['Attribute'] = data['Attribute'].apply(lambda x: f'{name_format}=={name_format}_{str(i)}' if name == x else x)
+                else:
+                    data = sv.network_trimmed_attributes.value
                 if trimmed_atts > 0:
-                    st.dataframe(sv.network_trimmed_attributes.value, hide_index=True, use_container_width=True)
-
-
+                    st.dataframe(data, hide_index=True, use_container_width=True)
     with view_tab:
         if 
len(sv.network_entity_df.value) == 0: @@ -346,8 +361,14 @@ def create(): show_groups = st.checkbox('Show groups', value = sv.network_last_show_groups.value) with b3: dl_button = st.empty() - show_df = sv.network_entity_df.value.copy() - + + if sv_home.protected_mode.value: + show_df = sv.network_entity_df.value.copy() + for i, name in enumerate(show_df['Entity ID'], start=1): + show_df['Entity ID'] = show_df['Entity ID'].apply(lambda x: f'Entity ID_{str(i)}' if name == x else x) + else: + show_df = sv.network_entity_df.value.copy() + if show_groups != sv.network_last_show_groups.value: sv.network_last_show_groups.value = show_groups sv.network_table_index.value += 1 @@ -363,16 +384,17 @@ def create(): # Use group values as columns with values in them df = df.pivot_table(index='Entity ID', columns='Group', values='Value', aggfunc='first').reset_index() show_df = show_df.merge(df, on='Entity ID', how='left') + last_df = show_df.copy() if not show_entities: - show_df = show_df.drop(columns=['Entity ID', 'Entity Flags']).drop_duplicates().reset_index(drop=True) - dl_button.download_button('Download network data', show_df.to_csv(index=False), 'network_data.csv', 'Download network data') - gb = GridOptionsBuilder.from_dataframe(show_df) + last_df = last_df.drop(columns=['Entity ID', 'Entity Flags']).drop_duplicates().reset_index(drop=True) + dl_button.download_button('Download network data', last_df.to_csv(index=False), 'network_data.csv', 'Download network data') + gb = GridOptionsBuilder.from_dataframe(last_df) gb.configure_default_column(flex=1, wrapText=True, wrapHeaderText=True, enablePivot=False, enableValue=False, enableRowGroup=False) gb.configure_selection(selection_mode="single", use_checkbox=False) gb.configure_side_bar() gridoptions = gb.build() response = AgGrid( - show_df, + last_df, key=f'report_grid_{sv.network_table_index.value}', height=400, gridOptions=gridoptions, @@ -383,7 +405,7 @@ def create(): header_checkbox_selection_filtered_only=False, use_checkbox=False, enable_quicksearch=True, - reload_data=False, + reload_data=True, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW ) @@ -466,13 +488,63 @@ def create(): gp.markdown(f'##### Entity {selected_entity} in Network {selected_network} (full)') else: gp.markdown(f'##### Network {selected_network} (full)') - functions.get_entity_graph(N, f'{config.entity_label}{config.att_val_sep}{selected_entity}', full_links_df, 1000, 700, [config.entity_label] + list(sv.network_node_types.value)) + + nodes, edges, g_config = functions.get_entity_graph(N, f'{config.entity_label}{config.att_val_sep}{selected_entity}', full_links_df, 1000, 700, [config.entity_label] + list(sv.network_node_types.value)) + if sv_home.protected_mode.value: + new_nodes = [] + all_nodes = set(full_links_df["source"]).union(set(full_links_df["target"])) + original = sv.network_attributes_protected.value[0] + new = sv.network_attributes_protected.value[1] + entities_new = sv.network_entity_df.value.copy() + + for node in nodes: + sources = full_links_df["source"].tolist() + targets = full_links_df["target"].tolist() + + for li in sources: + lia = li.split('==') + if lia[1] in node.label: + raw_label = '('+node.label.split('(')[1] + found_indices = [index for index, x in enumerate(original) if lia[1] in x] + if len(found_indices) == 0: + found_indices = [index for index, x in enumerate(entities_new['Entity ID'].tolist()) if lia[1] in x] + if len(found_indices) > 0: + node.label = show_df['Entity ID'].tolist()[found_indices[0]]+'\n'+raw_label + flags 
= node.title.split('\n')[1] + node.title = show_df['Entity ID'].tolist()[found_indices[0]]+'\n'+flags + elif len(found_indices) > 0: + node.label = new[found_indices[0]]+'\n'+raw_label + flags = node.title.split('\n')[1] + node.title = new[found_indices[0]]+'\n'+flags + new_nodes.append(node) + + for li in targets: + lia = li.split('==') + if lia[1] in node.label: + raw_label = '('+node.label.split('(')[1] + found_indices = [index for index, x in enumerate(original) if lia[1] in x] + if len(found_indices) == 0: + found_indices = [index for index, x in enumerate(entities_new['Entity ID'].tolist()) if lia[1] in x] + if len(found_indices) > 0: + node.label = show_df['Entity ID'].tolist()[found_indices[0]]+'\n'+raw_label + flags = node.title.split('\n')[1] + node.title = show_df['Entity ID'].tolist()[found_indices[0]]+'\n'+flags + elif len(found_indices) > 0: + node.label = new[found_indices[0]]+'\n'+raw_label + flags = node.title.split('\n')[1] + node.title = new[found_indices[0]]+'\n'+flags + new_nodes.append(node) + else: + new_nodes = nodes + + agraph(nodes=new_nodes, edges=edges, config=g_config) # type: ignore elif graph_type == 'Simplified': if selected_entity != '': gp.markdown(f'##### Entity {selected_entity} in Network {selected_network} (simplified)') else: gp.markdown(f'##### Network {selected_network} (simplified)') - functions.get_entity_graph(N1, f'{config.entity_label}{config.att_val_sep}{selected_entity}', merged_links_df, 1000, 700, [config.entity_label] + list(sv.network_node_types.value)) + nodes, edges, g_config = functions.get_entity_graph(N1, f'{config.entity_label}{config.att_val_sep}{selected_entity}', merged_links_df, 1000, 700, [config.entity_label] + list(sv.network_node_types.value)) + agraph(nodes=nodes, edges=edges, config=g_config) # type: ignore sv.network_merged_links_df.value = merged_links_df sv.network_merged_nodes_df.value = merged_nodes_df else: @@ -525,11 +597,12 @@ def create(): if len(sv.network_report.value) == 0: gen_placeholder.warning('Press the Generate button to create an AI report for the current network.') report_data = sv.network_report.value + report_placeholder.markdown(report_data) util.ui_components.report_download_ui(sv.network_report, 'network_report') - if sv.network_report_validation.value != {}: + if sv.network_report_validation.value != {} and len(sv.network_report.value) > 0: if generated: validation_status.update(label=f"LLM faithfulness score: {sv.network_report_validation.value['score']}/5", state='complete') else: From 3835057df8af0e557b14afa462469111ea453c94 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Wed, 17 Apr 2024 15:49:40 -0300 Subject: [PATCH 2/6] add fix protected mode on RM --- app/workflows/record_matching/workflow.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/app/workflows/record_matching/workflow.py b/app/workflows/record_matching/workflow.py index 4189ae15..fdbe1c6b 100644 --- a/app/workflows/record_matching/workflow.py +++ b/app/workflows/record_matching/workflow.py @@ -335,6 +335,12 @@ def att_ui(i): prefix = prefix + response + '\n' result = prefix.replace('```\n', '').strip() + + if sv_home.protected_mode.value: + unique_names = sv.matching_matches_df.value['Entity name'].unique() + for i, name in enumerate(unique_names, start=1): + result = result.replace(name, 'Entity_{}'.format(i)) + sv.matching_evaluations.value = pl.read_csv(io.StringIO(result), read_csv_options={"truncate_ragged_lines": True}) validation, messages_to_llm = util.ui_components.validate_ai_report(messages, 
sv.matching_evaluations.value) From 7c8b3ed1b442ebceda3b1388ee201c8a8abee30b Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Wed, 17 Apr 2024 15:58:43 -0300 Subject: [PATCH 3/6] fix truncate lines --- app/workflows/record_matching/workflow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/app/workflows/record_matching/workflow.py b/app/workflows/record_matching/workflow.py index fdbe1c6b..594488d5 100644 --- a/app/workflows/record_matching/workflow.py +++ b/app/workflows/record_matching/workflow.py @@ -341,7 +341,8 @@ def att_ui(i): for i, name in enumerate(unique_names, start=1): result = result.replace(name, 'Entity_{}'.format(i)) - sv.matching_evaluations.value = pl.read_csv(io.StringIO(result), read_csv_options={"truncate_ragged_lines": True}) + csv = pl.read_csv(io.StringIO(result)) + sv.matching_evaluations.value = csv.drop_nulls() validation, messages_to_llm = util.ui_components.validate_ai_report(messages, sv.matching_evaluations.value) sv.matching_report_validation.value = json.loads(validation) @@ -366,6 +367,6 @@ def att_ui(i): obj = json.dumps({ "message": sv.matching_report_validation_messages.value, "result": sv.matching_report_validation.value, - "report": sv.matching_evaluations.value + "report": pd.DataFrame(sv.matching_evaluations.value).to_json() }, indent=4) st.download_button('Download faithfulness evaluation', use_container_width=True, data=str(obj), file_name=f'matching_{get_current_time}_messages.json', mime='text/json') From b2f4e9fe0379adae8ef926d542ef88d88bf99446 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Wed, 17 Apr 2024 16:06:37 -0300 Subject: [PATCH 4/6] when generating new matching on RM, clear report --- app/workflows/record_matching/workflow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/app/workflows/record_matching/workflow.py b/app/workflows/record_matching/workflow.py index 594488d5..e67c6b48 100644 --- a/app/workflows/record_matching/workflow.py +++ b/app/workflows/record_matching/workflow.py @@ -144,6 +144,7 @@ def att_ui(i): name_similarity = st.number_input('Matching name similarity (min)', min_value=0.0, max_value=1.0, step=0.01, value=sv.matching_sentence_pair_jaccard_threshold.value, help='The minimum Jaccard similarity between the character trigrams of the names of two records for them to be considered a match. 
Higher values will result in fewer closer name matches.') if st.button('Detect record groups', use_container_width=True): + sv.matching_evaluations.value = pl.DataFrame() if record_distance != sv.matching_sentence_pair_embedding_threshold.value: sv.matching_sentence_pair_embedding_threshold.value = record_distance if name_similarity != sv.matching_sentence_pair_jaccard_threshold.value: From aef8e2ecdc74759a28f2e8adfcadfd891e7c7fae Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Wed, 17 Apr 2024 16:19:33 -0300 Subject: [PATCH 5/6] reset report validation with reset of report --- app/workflows/attribute_patterns/workflow.py | 1 + app/workflows/question_answering/workflow.py | 1 + app/workflows/record_matching/workflow.py | 1 + app/workflows/risk_networks/workflow.py | 3 ++- 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/app/workflows/attribute_patterns/workflow.py b/app/workflows/attribute_patterns/workflow.py index f05602df..7165d5d6 100644 --- a/app/workflows/attribute_patterns/workflow.py +++ b/app/workflows/attribute_patterns/workflow.py @@ -132,6 +132,7 @@ def create(): sv.attribute_selected_pattern.value = selected_pattern sv.attribute_selected_pattern_period.value = selected_pattern_period sv.attribute_report.value = '' + sv.attribute_report_validation.value = {} st.rerun() st.markdown('**Selected pattern: ' + selected_pattern + ' (' + selected_pattern_period + ')**') diff --git a/app/workflows/question_answering/workflow.py b/app/workflows/question_answering/workflow.py index 22d4445c..5f74fa9d 100644 --- a/app/workflows/question_answering/workflow.py +++ b/app/workflows/question_answering/workflow.py @@ -64,6 +64,7 @@ def create(): sv.answering_next_q_id.value = 1 sv.answering_surface_questions.value = {} sv.answering_deeper_questions.value = {} + sv.answering_report_validation.value = {} sv.answering_target_matches.value = answering_target_matches sv.answering_source_diversity.value = answering_source_diversity sv.answering_last_lazy_question.value = question diff --git a/app/workflows/record_matching/workflow.py b/app/workflows/record_matching/workflow.py index e67c6b48..6fc28388 100644 --- a/app/workflows/record_matching/workflow.py +++ b/app/workflows/record_matching/workflow.py @@ -145,6 +145,7 @@ def att_ui(i): if st.button('Detect record groups', use_container_width=True): sv.matching_evaluations.value = pl.DataFrame() + sv.matching_report_validation.value = {} if record_distance != sv.matching_sentence_pair_embedding_threshold.value: sv.matching_sentence_pair_embedding_threshold.value = record_distance if name_similarity != sv.matching_sentence_pair_jaccard_threshold.value: diff --git a/app/workflows/risk_networks/workflow.py b/app/workflows/risk_networks/workflow.py index a43028b4..4e8d695a 100644 --- a/app/workflows/risk_networks/workflow.py +++ b/app/workflows/risk_networks/workflow.py @@ -416,6 +416,7 @@ def create(): sv.network_selected_entity.value = selected_entity sv.network_selected_community.value = selected_network sv.network_report.value = '' + sv.network_report_validation.value = {} c_nodes = sv.network_community_nodes.value[selected_network] N = functions.build_network_from_entities(sv, sv.network_overall_graph.value, c_nodes) if selected_entity != '': @@ -602,7 +603,7 @@ def create(): util.ui_components.report_download_ui(sv.network_report, 'network_report') - if sv.network_report_validation.value != {} and len(sv.network_report.value) > 0: + if sv.network_report_validation.value != {}: if generated: validation_status.update(label=f"LLM 
faithfulness score: {sv.network_report_validation.value['score']}/5", state='complete')
            else:

From 42738454859c2d338330e31e9597e5ee797386ae Mon Sep 17 00:00:00 2001
From: Dayenne Souza
Date: Wed, 17 Apr 2024 16:29:19 -0300
Subject: [PATCH 6/6] add 30 samples of record matching to validate

---
 app/workflows/record_matching/workflow.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/app/workflows/record_matching/workflow.py b/app/workflows/record_matching/workflow.py
index 6fc28388..8be90947 100644
--- a/app/workflows/record_matching/workflow.py
+++ b/app/workflows/record_matching/workflow.py
@@ -346,7 +346,12 @@ def att_ui(i):
                 csv = pl.read_csv(io.StringIO(result))
                 sv.matching_evaluations.value = csv.drop_nulls()

                 # sample up to 30 random rows to evaluate
                 data_to_validate = sv.matching_evaluations.value
                 if len(sv.matching_evaluations.value) > 30:
                     data_to_validate = sv.matching_evaluations.value.sample(n=30)

                 validation, messages_to_llm = util.ui_components.validate_ai_report(messages, data_to_validate)
                 sv.matching_report_validation.value = json.loads(validation)
                 sv.matching_report_validation_messages.value = messages_to_llm
                 st.rerun()
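
Taken together, patches 1, 2, and 6 apply the same "protected mode" pattern in three places: build a positional alias (Entity_1, Entity_2, ...) for each unique real name, apply the aliases to the dataframe before display/download, apply the same aliases to the LLM-generated report text, and validate at most a bounded sample of rows. The following is a minimal, self-contained sketch of that pattern, not code from the app: it assumes polars (as in the record-matching workflow), and the helper names are illustrative.

# Sketch of the aliasing pattern used across these patches (assumes polars;
# helper names are illustrative and not part of the app's codebase).
import polars as pl


def build_alias_map(df: pl.DataFrame, column: str) -> dict:
    # Map each unique value to a stable positional alias: Entity_1, Entity_2, ...
    unique_names = df[column].unique(maintain_order=True).to_list()
    return {name: f"Entity_{i}" for i, name in enumerate(unique_names, start=1)}


def anonymize_column(df: pl.DataFrame, column: str, aliases: dict) -> pl.DataFrame:
    # Replace real names before display/download (dict form of Expr.replace;
    # assumes a recent polars release).
    return df.with_columns(pl.col(column).replace(aliases))


def anonymize_report(report_text: str, aliases: dict) -> str:
    # Apply the same aliases to free-form LLM output so the downloaded table
    # and the generated report stay consistent. Note: plain substring
    # replacement, as in the patches, can collide on overlapping names.
    for name, alias in aliases.items():
        report_text = report_text.replace(name, alias)
    return report_text


if __name__ == "__main__":
    df = pl.DataFrame({
        "Entity name": ["Acme Corp", "Acme Inc", "Globex"],
        "Group ID": [1, 1, 2],
    })
    aliases = build_alias_map(df, "Entity name")
    protected = anonymize_column(df, "Entity name", aliases)
    report = anonymize_report("Acme Corp likely matches Acme Inc.", aliases)

    # Patch 6 bounds validation cost the same way: evaluate at most 30 rows.
    to_validate = protected.sample(n=30) if len(protected) > 30 else protected
    print(protected, report, to_validate, sep="\n")

Building the alias map once and reusing it for both the table and the report text keeps the two outputs consistent, which is the property the per-loop replacements in patches 1 and 2 are aiming for.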