nanjiangwill · nanjiangwill · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 31, 2024
diff --git a/.env.example b/.env.example
@@ -6,3 +6,6 @@ export AWS_SECRET_ACCESS_KEY=
 export OPENAI_API_KEY=
 export ANTHROPIC_API_KEY=
 export WANDB_API_KEY=
+
+export AWS_ACCESS_KEY_ID=
+export AWS_SECRET_ACCESS_KEY=
diff --git a/.gitignore b/.gitignore
@@ -1,19 +1,32 @@
 .diskcache
 .env
+.idea
 .mypy_cache
+.streamlit
+.venv
+.vscode
+Procfile
 data/*/eval
 data/*/format_ocr
 data/*/index
 data/*/llm
 data/*/normalization
 data/*/ocr
+data/*/original_ocr
 data/*/pdfs
 data/*/prompt
 data/*/search
 data/.DS_Store
 hydra_outputs
+key.json
 old_eval.py
-results/
+old_results
+results/*/*/*.json
+results/*/district_extraction
+results/*/district_extraction_verification
+results/*/page_embedding
+runtime.txt
+setup.sh
 wandb/
 zoning.egg-info
 zoning.egg-info/

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,3 +1,5 @@
+exclude: '^results/'
+
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: 'v4.6.0'
@@ -18,7 +20,7 @@ repos:
     -   id: black
     -   id: black-jupyter
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.10.0'  # Use the sha / tag you want to point at
+    rev: 'v1.11.0'  # Use the sha / tag you want to point at
     hooks:
     -   id: mypy
         # args: ['--explicit-package-bases']

diff --git a/Procfile b/Procfile
@@ -0,0 +1 @@
+web: sh setup.sh && streamlit run viz/viz_user_mode_batch.py
diff --git a/config/base.yaml b/config/base.yaml
@@ -6,7 +6,7 @@ global_config:
   experiment_dir: results/${global_config.experiment_name} # helper variable, just used to parse
 
   target_state: connecticut
-  eval_terms: ["min_lot_size", "min_unit_size", "max_height"] # all available eval terms ['floor_to_area_ratio', 'max_height', 'max_lot_coverage', 'max_lot_coverage_pavement', 'min_lot_size', 'min_parking_spaces', 'min_unit_size']
+  eval_terms: ["min_lot_size", "min_unit_size", "max_height"] # all available eval terms ['floor_to_area_ratio', 'max_height', 'max_lot_coverage', 'max_lot_coverage_pavement', 'min_lot_size', 'min_parking_spaces', 'min_unit_size', 'units_per_acre']
 
   data_dir: data/${global_config.target_state}
   target_town_file: ${global_config.data_dir}/target_towns_names.json
@@ -18,9 +18,12 @@ global_config:
   result_output_dir: results/${global_config.target_state}/${global_config.experiment_name} # helper variable, just used to parse
 
   pdf_dir: ${global_config.data_dir}/pdfs   # normally we dont redo pdf collection, we just save them in data
-  ocr_dir: ${global_config.data_dir}/ocr    # normally we dont redo ocr collection, we just save them in data, not in experiment results
+  ocr_dir: ${global_config.data_dir}/original_ocr    # normally we dont redo ocr collection, we just save them in data, not in experiment results
 
   format_ocr_dir: ${global_config.experiment_dir}/format_ocr
+  page_embedding_dir: ${global_config.experiment_dir}/page_embedding
+  district_extraction_dir: ${global_config.experiment_dir}/district_extraction
+  district_extraction_verification_dir: ${global_config.experiment_dir}/district_extraction_verification
   index_dir: ${global_config.experiment_dir}/index
   search_dir: ${global_config.experiment_dir}/search
   prompt_dir: ${global_config.experiment_dir}/prompt
@@ -40,13 +43,25 @@ global_config:
 ocr_config:
   method: textract
   run_ocr: false
-  input_document_s3_bucket:
-  pdf_name_prefix_in_s3_bucket: zoning/${global_config.target_state}/
+  textract_region_name: us-east-2
+  input_document_s3_bucket: zoning-nan
+  pdf_name_prefix_in_s3_bucket: ${global_config.target_state}
   feature_types: ["TABLES"]  # allowed ["TABLES", "FORMS", "QUERIES", "SIGNATURES", "LAYOUT"]
 
 format_ocr_config:
   temp: x
 
+district_extraction_config:
+  run_district_extraction: false
+  embedding_model: text-embedding-3-small
+  llm_model: ${llm_config.llm_name}
+  templates_dir: ${prompt_config.templates_dir}
+  system_prompt_file: district_extraction_system
+  user_prompt_file: district_extraction_user
+  verification_es_endpoint: ${global_config.es_endpoint}
+  target_districts_file: ${global_config.target_district_file}
+  district_page_mapping_file: ${global_config.data_dir}/district_page_mapping.json
+
 index_config:
   method: keyword # allowed keyword/embedding
   index_key: town
@@ -70,7 +85,7 @@ prompt_config:
 
 llm_config:
   llm_name: gpt-4-1106-preview
-  max_tokens: 256
+  max_tokens: 512
   formatted_response: false
   cache_dir: .diskcache
 

diff --git a/config/templates/district_extraction_system.pmpt.tpl b/config/templates/district_extraction_system.pmpt.tpl
@@ -0,0 +1,7 @@
+You are an expert information extraction system. You are given a
+passage that shows the zoning districts of a town and their
+abbreviations. Your job is to list the zoning districts and their
+abbreviations.  Only output districts that have abbreviations.
+Please output the answer only with JSON (no text) in the format:
+
+[{"T": "district type", "Z": "district abbreviation with number"}].
diff --git a/config/templates/district_extraction_user.pmpt.tpl b/config/templates/district_extraction_user.pmpt.tpl
@@ -0,0 +1,68 @@
+Passage:
+
+Some text about buildings
+
+Output:
+
+[]
+
+Passage:
+
+* Residential (R) districts
+
+CELL
+Residential
+CELL
+R-10
+CELL
+R-20
+CELL
+
+Output:
+
+[{"T": "Residential",  "Z": "R-10"}, {"T": "Residential",  "Z": "R-20"}]
+
+Passage:
+
+* Business (C) districts:
+
+(C19) Commercial 19
+(C29) Commercial 29
+
+Output:
+
+[{"T": "Commercial 19",  "Z": "C19"}, {"T": "Commercial 29",  "Z": "C29"}]
+
+Passage:
+
+CELL
+Residential Districts
+CELL
+R-5 District
+R-10 District
+R-20 District
+
+Output:
+
+[{"T": "R-5 Residential",  "Z": "R-5"}, {"T": "R-10 Residential",  "Z": "R-10"}, {"T": "R-20 Residential",  "Z": "R-20"}]
+
+Passage:
+
+Residence AAA District
+Residence B District
+Historic Design District (HDD)
+
+Output:
+
+[{"T": "Residence AAA",  "Z": "AAA"}, {"T": "Residence B",  "Z": "B"}, {"T": "Historic Design",  "Z": "HDD"}]
+
+Passage:
+
+{% macro showdocs(docs) -%}
+{% for doc in docs %}
+* {{doc}}
+{% endfor %}
+{% endmacro %}
+{{showdocs(docs) | truncate(1200*4)}}
+
+Output:
diff --git a/config/templates/few_shot.pmpt.tpl b/config/templates/few_shot.pmpt.tpl
@@ -1,25 +1,32 @@
 # Instructions
 
-You are an expert architectural lawyer. You are looking for facts inside a
-document about a Zoning District with the name "{{zone_name}}" and with an
-abbreviated name "{{zone_abbreviation}}".
+You are an expert architectural lawyer tasked with extracting specific zoning information from a
+document. Your goal is to find facts about a particular Zoning District with the name "{{zone_name}}" and with an
+abbreviated name "{{zone_abbreviation}}".
 
-You are looking to find the value for "{{term}}", which also goes by the
+You are looking to find the value for "{{term}}", which may also be referred to by the
 following other names: {{synonyms}}. Only output values that are seen in the
 input and do not guess! Output MUST be valid JSON, and should follow the schema
-detailed below. Ensure that the field "extracted_text" does not span multiple
-lines and that it is a real substring of the input. You CANNOT make up a value
-for "extracted_text", and it MUST be a substring! "extracted_text" will be used
-in the python statement `extracted_text in input` and if that returns False, the
-universe will be destroyed! If you cannot extract reasonable text, then you
-should not return an answer. For {{term}} in residential districts, we are only
-interested in the answer as it pertains to single-family homes.
+detailed below. Ensure that, in the field "extracted_text", the first element of
+the inner list does not span multiple lines and that it is a real substring of the input.
+You CANNOT make up a value for "extracted_text", and it MUST be a substring!
+"extracted_text" will be used in the python statement `extracted_text in input`
+and if that returns False, the universe will be destroyed! If you cannot extract
+reasonable text, then you should not return an answer. If {{zone_name}}
+({{zone_abbreviation}}) is referring to a general residential district,
+we are only interested in the requirement of {{term}} for single-family homes.
+However, if it is referring to a specific district, like Multi Family Residential (MFR),
+General Commercial (GC), etc., we are still interested in the requirement of {{term}}
+for {{zone_name}} ({{zone_abbreviation}}). Remember, the text given to you is a
+document that is part of a larger document, which means you might find an answer that is
+not for the zone "{{zone_name}} ({{zone_abbreviation}})" but for other zones.
+Double-check your answer to ensure it corresponds to the correct zone district "{{zone_name}}".
 
 # Schema
 {
-    "extracted_text": list[str], // The verbatim text from which the result was extracted. ONLY USE VALUES EXTRACTED DIRECTLY FROM THE TEXT. Make sure to include "\n" and any type of special characters.
+    "extracted_text": List[List[str, int]], // A list of lists. Each inner list must contain exactly two elements: The first element is a string representing the verbatim text from which the result was extracted. ONLY USE VALUES EXTRACTED DIRECTLY FROM THE TEXT. Make sure to include \n and any special characters and DO NOT span multiple lines. The second element is an integer representing the page where the verbatim text is found. Multiple extracted texts from different pages may correspond to the answer, so the extracted_text field should always be a list of lists, even if only one inner list is present.
     "rationale": str, // A string containing a natural language explanation for the following answer
-    "answer": str // The value of {{term}} extracted from the text. Answer must include units and must be normalized, e.g. (sqr. ft. becomes sq ft)
+    "answer": str // A string representing the value of {{term}} extracted from the text. Answer must include units and must be normalized, e.g. (sqr. ft. becomes sq ft)
 }
 
 {% include term + "_examples.pmpt.tpl" %}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		web: sh setup.sh && streamlit run viz/viz_user_mode_batch.py