Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Pin more stats and remove photos.ndjson from geocoding #178

Merged
merged 7 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/e2etest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,14 @@ jobs:
run: |
export PYTHONPATH=.
tar -xzf geocache.tgz
poetry run oldnyc/geocode/geocode.py --images_ndjson data/photos.ndjson --lat_lon_map data/lat-lon-map.txt --output_format lat-lon-to-ids.json --geocode > data/lat-lon-to-ids.json
poetry run oldnyc/geocode/geocode.py --images_ndjson data/images.ndjson --lat_lon_map data/lat-lon-map.txt --output_format lat-lon-to-ids.json --geocode > data/lat-lon-to-ids.json 2> >(tee >( sed -n '/Finalizing/,$p' > test/geocoding-stats.txt) >&2)
- name: Generate static site
run: |
export PYTHONPATH=.
echo '{"fixes": {}}' > data/feedback/fixes.json
poetry run oldnyc/site/generate_static_site.py --leave-timestamps-unchanged
poetry run oldnyc/site/generate_static_site.py --leave-timestamps-unchanged 2> >(tee test/site-stats.txt >&2)
- name: Check for diffs
run: |
git diff --exit-code test/
cd ../oldnyc.github.io
git diff --exit-code
2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

Binary file modified geocache.tgz
Binary file not shown.
15 changes: 13 additions & 2 deletions oldnyc/site/generate_static_site.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,19 @@
popular_photos: list[PopularPhoto] = json.loads(open("data/popular-photos.js").read()[20:-2])
pop_ids = {x["id"] for x in popular_photos}

lat_lon_to_ids: dict[str, list[str]] = json.load(open("data/lat-lon-to-ids.json"))
lat_lon_to_item_ids: dict[str, list[str]] = json.load(open("data/lat-lon-to-ids.json"))

rs = load_items("data/photos.ndjson")
id_to_record = {r.id: r for r in rs}
item_id_to_photo_ids = defaultdict[str, list[str]](list)
for r in rs:
item_id = r.id.split("-")[0]
item_id_to_photo_ids[item_id].append(r.id)

lat_lon_to_ids = {
ll: [photo_id for item_id in item_ids for photo_id in item_id_to_photo_ids[item_id]]
for ll, item_ids in lat_lon_to_item_ids.items()
}

id_to_dims: dict[str, tuple[int, int]] = {}
for photo_id, width, height in csv.reader(open("data/nyc-image-sizes.txt")):
Expand Down Expand Up @@ -223,9 +232,10 @@ def site_response_to_site_json(r: SiteResponse, latlon: tuple[float, float]) ->
all_photos.append(site_response_to_site_json(response, (lat, lon)))

photo_ids_on_site = {photo["photo_id"] for photo in all_photos}
item_ids_on_site = {id.split("-")[0] for id in photo_ids_on_site}

missing_popular = {id_ for id_ in pop_ids if id_ not in photo_ids_on_site}
sys.stderr.write(f"Missing popular: {missing_popular}\n")
sys.stderr.write(f"Missing popular: {sorted(missing_popular)}\n")
print("Date extraction stats:")
dates_from_text.log_stats()

Expand Down Expand Up @@ -294,6 +304,7 @@ def site_response_to_site_json(r: SiteResponse, latlon: tuple[float, float]) ->
with open("../oldnyc.github.io/static/timestamps.js", "w") as f:
f.write(f"var timestamps = {timestamps_json};")

sys.stderr.write(f"NYPL items on site: {len(item_ids_on_site)}\n")
sys.stderr.write(f"Unique photos on site: {len(photo_ids_on_site)}\n")
sys.stderr.write(f"Text-less photos: {len(textless_photo_ids)}\n")
sys.stderr.write(f"Unique lat/lngs: {len(latlon_to_count)}\n")
Expand Down
51 changes: 51 additions & 0 deletions test/geocoding-stats.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
-- Finalizing title-cross --
titles matched: 0
alt titles matched: 0
total matches: 31270
counters: [('boro-int', 30503), ('title', 30096), ('alt_title', 1174), ('at-int', 288), ('num-prefix', 271), ('between', 208)]
grid: 7813 (18776 attempts)
google: 17946
boro mismatch: 504
failures: 4850
-- Finalizing title-address --
address matches: 638
patterns: [('street_pound', 427), ('num_street', 211)]
google: 608
boro mismatch: 13
failures: 17
-- Finalizing gpt --
GPT POI: 5096
GPT address: 721
GPT intersection: 7359
grid: 406 (3697 attempts)
google: 2259
boro mismatch: 499
failures: 4916
-- Finalizing special --
Special cases: [('Columbus Circle', 47), ('China Daily News', 23), ('St. John the Divine', 7), ('Squatters: Camp Thomas Paine', 6), ('Mt. Sinai', 5)]
-- Finalizing subjects --
POI/subject geocoding:
1034 n_both
2262 n_geo
127 n_geo_multi
2135 n_geo_unambig
858 n_out_both_close
30 n_out_both_fallback_title
40 n_out_both_subject
106 n_out_both_title
1141 n_out_subject
1204 n_out_title
1244 n_title
363 n_title_bridge
260 n_title_island
621 n_title_park
-- Final stats --
25759 title-cross
608 title-address
2665 gpt
88 special
2345 subjects
31465 (total)
Dropped w/ no date: 0
Unique lat/longs: 9432
Total photographs: 31465
6 changes: 6 additions & 0 deletions test/site-stats.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Missing popular: ['716608f-a', '719363f-a', '721912f-b', '726358f-c']
NYPL items on site: 31465
Unique photos on site: 45562
Text-less photos: 1142
Unique lat/lngs: 9432
Orphaned popular photos: 4 / 54