add pre-processing steps export and import
dayesouza committed Nov 8, 2024
1 parent 6f864ae commit 3553a8f
Showing 3 changed files with 71 additions and 26 deletions.
69 changes: 46 additions & 23 deletions app/workflows/query_text_data/workflow.py
@@ -63,19 +63,28 @@ async def create(sv: SessionVariables, workflow=None):
)
with uploader_tab:
st.markdown("##### Upload data for processing")
files = st.file_uploader(
"Upload PDF text files",
type=["pdf", "txt", "json", "csv"],
accept_multiple_files=True,
key="qtd_uploader_" + st.session_state[f"{workflow}_uploader_index"],

upload_type = st.radio(
"Upload type",
options=["Raw files", "Processed data"],
key=f"{workflow}_data_source",
)
# window_size = st.selectbox(
# "Analysis time window",
# key=sv.analysis_window_size.key,
# options=[str(x) for x in input_processor.PeriodOption._member_names_],
# )
# window_period = input_processor.PeriodOption[window_size]
# window_period = input_processor.PeriodOption.NONE
files = None
file_chunks = None
if upload_type == "Raw files":
files = st.file_uploader(
"Upload PDF text files",
type=["pdf", "txt", "json", "csv"],
accept_multiple_files=True,
key="qtd_uploader_" + st.session_state[f"{workflow}_uploader_index"],
)
else:
file_chunks = st.file_uploader(
"Upload processed files",
type=["csv"],
key="chunk_uploader_" + st.session_state[f"{workflow}_uploader_index"],
)

local_embedding = st.toggle(
"Use local embeddings",
key=sv.answer_local_embedding_enabled.key,
@@ -84,28 +93,32 @@ async def create(sv: SessionVariables, workflow=None):
)
qtd.set_embedder(embedder.create_embedder(local_embedding))

if files is not None and st.button("Process files"):
if st.button("Process files") and (
files is not None or file_chunks is not None
):
qtd.reset_workflow()

file_pb, file_callback = functions.create_progress_callback(
"Loaded {} of {} files..."
)
qtd.process_data_from_files(
input_file_bytes={file.name: file.getvalue() for file in files},
callbacks=[file_callback],
)
if upload_type == "Raw files":
file_pb, file_callback = functions.create_progress_callback(
"Loaded {} of {} files..."
)
qtd.process_data_from_files(
input_file_bytes={file.name: file.getvalue() for file in files},
callbacks=[file_callback],
)
file_pb.empty()
else:
qtd.import_chunks_from_str(file_chunks)

chunk_pb, chunk_callback = functions.create_progress_callback(
"Processed {} of {} chunks..."
)
qtd.process_text_chunks(callbacks=[chunk_callback])

embed_pb, embed_callback = functions.create_progress_callback(
"Embedded {} of {} text chunks..."
)
await qtd.embed_text_chunks(callbacks=[embed_callback])
chunk_pb.empty()
file_pb.empty()
embed_pb.empty()
st.rerun()
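
Each processing stage above pairs a progress bar with a callback that the long-running call invokes as it advances, and every bar is cleared with `.empty()` once its stage finishes. A minimal sketch of what a factory like `functions.create_progress_callback` might look like — the `(current, total)` callback signature is an assumption for illustration, not the repo's actual helper:

```python
import streamlit as st

def create_progress_callback(template: str):
    """Sketch only: return a progress bar plus a callback that updates it."""
    progress_bar = st.progress(0)

    def on_progress(current: int, total: int) -> None:
        # Guard against empty inputs so the fraction stays in [0.0, 1.0].
        fraction = current / total if total else 0.0
        progress_bar.progress(fraction, text=template.format(current, total))

    return progress_bar, on_progress
```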

Expand All @@ -121,6 +134,16 @@ async def create(sv: SessionVariables, workflow=None):
message += "."
message = message.replace("**1** periods", "**1** period")
st.success(message)

if qtd.label_to_chunks and upload_type == "Raw files":
st.download_button(
label="Download processed data",
help="Download the processed data as a CSV file to reuse later.",
data=qtd.get_chunks_as_df().to_csv(),
file_name=f"processed_data_{len(qtd.label_to_chunks)}_query_text.csv",
mime="text/csv",
)

with graph_tab:
if qtd.stage.value < QueryTextDataStage.CHUNKS_PROCESSED.value:
st.warning("Process files to continue.")
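
Taken together, the workflow changes above swap a single uploader for a radio-driven branch: raw files go through the full chunking pipeline, while previously processed CSVs skip straight to import. A self-contained sketch of that branching, with the app's `qtd` calls replaced by placeholder writes:

```python
import streamlit as st

upload_type = st.radio("Upload type", options=["Raw files", "Processed data"])

if upload_type == "Raw files":
    files = st.file_uploader(
        "Upload PDF text files",
        type=["pdf", "txt", "json", "csv"],
        accept_multiple_files=True,
    )
    if files and st.button("Process files"):
        # The real app passes these bytes to qtd.process_data_from_files.
        raw_bytes = {f.name: f.getvalue() for f in files}
        st.write(f"Would chunk {len(raw_bytes)} raw file(s) from scratch.")
else:
    file_chunks = st.file_uploader("Upload processed files", type=["csv"])
    if file_chunks and st.button("Process files"):
        # The real app hands the upload to qtd.import_chunks_from_str instead,
        # skipping the per-file chunking step entirely.
        st.write("Would re-import previously exported chunks.")
```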
24 changes: 23 additions & 1 deletion intelligence_toolkit/query_text_data/api.py
@@ -1,6 +1,6 @@
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project.

from collections import defaultdict
from enum import Enum

import networkx as nx
@@ -316,5 +316,27 @@ def prepare_for_new_answer(self) -> None:
self.answer_object = None
self.stage = QueryTextDataStage.CHUNKS_MINED

def get_chunks_as_df(self) -> pd.DataFrame:
flat_data = []
for key, json_list in self.label_to_chunks.items():
for json_str in json_list:
item_data = {
"file_name": key,
"text_to_label_str": json_str,
}
flat_data.append(item_data)

return pd.DataFrame(flat_data)

def import_chunks_from_str(self, data: str) -> None:
chunks_df = pd.read_csv(data)
data_imported = defaultdict(list)
for _, row in chunks_df.iterrows():
key = row["file_name"]
row_data = row["text_to_label_str"]
data_imported[key].append(row_data)

self.label_to_chunks = data_imported

def __repr__(self):
return f"QueryTextData()"
4 changes: 2 additions & 2 deletions intelligence_toolkit/query_text_data/input_processor.py
@@ -63,7 +63,7 @@ def convert_file_bytes_to_chunks(

if file_name.endswith(".csv"):
df = pd.read_csv(io.BytesIO(bytes))
text_to_chunks = convert_df_to_chunks(df, file_name)
text_chunks = convert_df_to_chunks(df, file_name)
else:
if file_name.endswith(".pdf"):
page_texts = []
@@ -89,7 +89,7 @@
}
text_chunks[index] = dumps(chunk, indent=2, ensure_ascii=False)

text_to_chunks[file_name] = text_chunks
text_to_chunks[file_name] = text_chunks
return text_to_chunks
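
The change above fixes a naming slip: the CSV branch previously assigned its result to the accumulator dict `text_to_chunks` itself, and the per-file registration line sat inside the `else` branch. After the rename and the dedent, every file type accumulates into `text_chunks` and is then registered under its file name. A stubbed sketch of the restored pattern (the chunking itself is replaced by placeholders):

```python
def convert_files(file_bytes: dict[str, bytes]) -> dict[str, list[str]]:
    text_to_chunks: dict[str, list[str]] = {}
    for file_name in file_bytes:
        if file_name.endswith(".csv"):
            text_chunks = [f"csv chunk of {file_name}"]  # stand-in for convert_df_to_chunks
        else:
            text_chunks = [f"text chunk of {file_name}"]  # stand-in for PDF/txt/JSON handling
        # Registering outside the branch means every file type lands in the
        # result; the old code clobbered the dict for CSVs and never
        # registered them here.
        text_to_chunks[file_name] = text_chunks
    return text_to_chunks
```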


