Skip to content

Commit

Permalink
New vocab, copy date_cleaners from pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
azaroth42 committed Oct 5, 2020
1 parent ecdb2e1 commit d55e207
Show file tree
Hide file tree
Showing 2 changed files with 253 additions and 6 deletions.
237 changes: 237 additions & 0 deletions cromulent/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,3 +497,240 @@ def extract_monetary_amount(data, add_citations=False, currency_mapping=CURRENCY
amnt._label = '%s' % (price_amount,)
return amnt
return None



# Datetime Cleaning (from Getty Pipeline code)
# https://github.com/thegetty/pipeline/blob/master/pipeline/util/cleaners.py

def _coerce_int(part):
    """Return *part* as an int, or None when it cannot be converted."""
    if isinstance(part, int):
        return part
    try:
        return int(part)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate to the caller.
        return None


def ymd_to_datetime(year, month, day, which="begin"):
    """Build a ``YYYY-MM-DDThh:mm:ss`` timestamp string from loose date parts.

    Each part may be an int or anything ``int()`` accepts (e.g. a numeric
    string).  Missing, unparseable or out-of-range month/day values are
    replaced by the earliest possible value when ``which == "begin"`` and by
    the latest possible value otherwise.

    Args:
        year: Year of the date; negative years are rendered with a leading
            ``-``.
        month: Month number, expected in 1..12.
        day: Day of month, expected in 1..(days in that month).
        which: ``"begin"`` to produce the start of the period
            (``T00:00:00``); any other value produces the end
            (``T23:59:59``).

    Returns:
        The formatted timestamp string, or None when ``year`` cannot be
        converted to an int.
    """
    year = _coerce_int(year)
    if year is None:
        # Without a usable year there is nothing sensible to default to.
        return None

    month = _coerce_int(month)
    day = _coerce_int(day)
    earliest = which == "begin"

    if not month or not 1 <= month <= 12:
        month = 1 if earliest else 12

    # Number of days in the (possibly defaulted) month of that year.
    maxday = calendar.monthrange(year, month)[1]
    if not day or not 1 <= day <= maxday:
        day = 1 if earliest else maxday

    ystr = "%04d" % abs(year)
    if year < 0:
        ystr = "-" + ystr

    time_part = "00:00:00" if earliest else "23:59:59"
    return "%s-%02d-%02dT%s" % (ystr, month, day, time_part)



def date_parse(value, delim):
    """Parse a delimited date or year range into a ``[begin, end]`` pair.

    Two parts are read as a year range (``YYYY/YYYY``, ``YYYY/YY`` or
    ``YYYY/Y``; a one- or two-character second part borrows its leading
    digits from the first part).  Three parts are read as a single day, either
    year-first (when the first part has four characters) or year-last.
    A zero month or day is treated as 1, and a "month" above 12 is assumed
    to be a day, so month and day are swapped.

    Args:
        value: The text to parse, e.g. ``"1885/90"`` or ``"07/02/1897"``.
        delim: The separator to split on (``/``, ``-``, ``.``, ...).

    Returns:
        ``[begin, end]`` as ``datetime`` objects with an exclusive ``end``
        (1 January after the last year of a range, or the following day for
        a single date), or None when the value cannot be interpreted.  A
        diagnostic is printed for every rejected value.
    """
    # Strip each part so padding around the delimiter cannot skew the
    # length checks below.
    bits = [bit.strip() for bit in value.split(delim)]

    if len(bits) == 2:
        # YYYY/YYYY, YYYY/YY or YYYY/Y range
        b1, b2 = bits
        if len(b2) < 3:
            # Abbreviated end year: reuse the leading digits of the start.
            b2 = "%s%s" % (b1[:len(b1) - len(b2)], b2)
        elif len(b2) > 4:
            print("Bad range: %s" % value)
            return None
        try:
            return [datetime(int(b1), 1, 1), datetime(int(b2) + 1, 1, 1)]
        except (ValueError, OverflowError):
            print("Broken delim: %s" % value)
            return None

    if len(bits) == 3:
        # YYYY/MM/DD or DD/MM/YYYY or DD.MM.YYYY or YYYY.MM.DD
        try:
            m = int(bits[1])
            if len(bits[0]) == 4:
                y, d = int(bits[0]), int(bits[2])
            else:
                y, d = int(bits[2]), int(bits[0])
            if m == 0:
                m = 1
            if d == 0:
                d = 1
            if m > 12:
                # Not a valid month, so the source must be month-first: swap.
                d, m = m, d
            yearmonthday = datetime(y, m, d)
            return [yearmonthday, yearmonthday + timedelta(days=1)]
        except (ValueError, OverflowError):
            # Non-numeric parts and impossible dates are both rejected here
            # rather than escaping to the caller.
            print("Bad // value: %s" % value)
            return None

    print("broken / date: %s" % value)
    return None



def _year_bounds(begin_year, end_year, raw):
    """Return ``[1 Jan begin_year, 1 Jan end_year]``, keeping None for open ends.

    Warns and returns None when a year lies outside the range ``datetime``
    can represent, instead of letting the error escape to the caller.
    """
    try:
        begin = None if begin_year is None else datetime(begin_year, 1, 1)
        end = None if end_year is None else datetime(end_year, 1, 1)
    except (ValueError, OverflowError):
        warnings.warn("Year out of range in date: %s" % raw)
        return None
    return [begin, end]


def date_cleaner(value):
    """Turn a free-text date into a ``[begin, end]`` pair of datetimes.

    Recognised formats::

        YYYY[?]
        YYYY/MM/DD
        DD/MM/YYYY
        ca. YYYY
        aft[er|.] YYYY
        bef[ore|.] YYYY
        YYYY.MM.DD
        YYYY/(Y|YY|YYYY)
        YYYY-YY
        YYY0s
        YYYY-
        YYYY Mon
        YYYY Month DD

    Args:
        value: The raw date text; falsy values are treated as "no date".

    Returns:
        ``[begin, end]`` where ``end`` is exclusive and either element may
        be None for an open-ended period ("after"/"before"/``YYYY-``), or
        None when the value is empty, deliberately ignored, or cannot be
        parsed.  Unparseable values emit a warning rather than raising.
    """
    if not value:
        return None

    # Normalise the spelling variants before dispatching on the shape.
    value = value.replace("?", '')
    value = value.replace('est', '')
    value = value.replace("()", '')
    value = value.replace(' or ', '/')
    value = value.strip()
    value = value.replace('by ', 'bef.')
    value = value.replace('c.', 'ca.')
    value = value.replace('CA.', 'ca.')
    value = value.replace('af.', 'aft.')

    if not value:
        return None

    if value.startswith("|"):
        # Broken? null it out
        return None

    if len(value) == 4 and value.isdigit():
        # year only
        y = int(value)
        return _year_bounds(y, y + 1, value)

    if value.startswith('v.'):
        # Values with this prefix are not interpreted as dates.
        return None

    if value.endswith('s'):
        # 1950s
        if len(value) == 5 and value[:4].isdigit():
            y = int(value[:4])
            return _year_bounds(y, y + 10, value)
        warnings.warn("Bad YYYYs date: %s" % value)
        return None

    if len(value) == 5 and value[:4].isdigit() and value.endswith('-'):
        # "YYYY-": open-ended period starting in that year
        return _year_bounds(int(value[:4]), None, value)

    if value.startswith("ca"):
        # circa x: widen the parsed period by the module's CIRCA margin
        value = value[3:].strip()
        if len(value) == 4 and value.isdigit():
            y = int(value)
            return _year_bounds(y - CIRCA, y + CIRCA, value)

        # Try and parse it; stays None when there is no delimiter at all.
        val = None
        if '/' in value:
            val = date_parse(value, '/')
        elif '-' in value:
            val = date_parse(value, '-')

        if not val:
            warnings.warn("bad circa: %s" % value)
            return None

        try:
            val[0] -= CIRCA_D
            val[1] += CIRCA_D
        except OverflowError:
            # Widening pushed the period outside the datetime range.
            warnings.warn("bad circa: %s" % value)
            return None
        return val

    if value.startswith('aft'):
        # after x
        value = value.replace('aft.', '').replace('after ', '').strip()
        try:
            y = int(value)
        except ValueError:
            warnings.warn("Bad aft value: %s" % value)
            return None
        return _year_bounds(y, None, value)

    if value.startswith('bef'):
        # before x
        value = value.replace('bef.', '').replace('before ', '').strip()
        try:
            y = int(value)
        except ValueError:
            warnings.warn("Bad bef value: %s" % value)
            return None
        return _year_bounds(None, y, value)

    # year/year or year/month/date, e.g. 1885/90 or 07/02/1897; the first
    # delimiter found, in this priority order, decides how to split.
    for delim in ('/', '.', '-', ';'):
        if delim in value:
            return date_parse(value, delim)

    # Month names must be matched in the C locale regardless of the
    # process locale.
    with c_locale():
        try:
            yearmonthday = datetime.strptime(value, '%Y %B %d')
        except ValueError:
            pass
        else:
            return [yearmonthday, yearmonthday + timedelta(days=1)]

        try:
            yearmonth = datetime.strptime(value, '%Y %b')
        except ValueError:
            pass
        else:
            # Whole month: from its first day to the first day of the next.
            maxday = calendar.monthrange(yearmonth.year, yearmonth.month)[1]
            return [yearmonth, yearmonth + timedelta(days=maxday)]

    warnings.warn(f'fell through to: {value!r}')
    return None

@contextmanager
def c_locale():
    """Temporarily switch every locale category to ``'C'``.

    The previous setting is captured with ``setlocale(LC_ALL)``, which
    returns an opaque string covering all categories, and is restored on
    exit even if the body raises.  (Saving ``locale.getlocale()`` instead
    would record only the ``LC_CTYPE`` category and then apply it to every
    category on restore.)

    Note that ``setlocale`` changes process-wide state, so this is not safe
    to use while other threads depend on the locale.
    """
    saved = locale.setlocale(locale.LC_ALL)
    locale.setlocale(locale.LC_ALL, 'C')
    try:
        yield
    finally:
        locale.setlocale(locale.LC_ALL, saved)

22 changes: 16 additions & 6 deletions cromulent/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ def register_instance(name, data):
"Diameter": {"parent": Dimension, "id":"300055624", "label": "Diameter"},
"Weight": {"parent": Dimension, "id":"300056240", "label": "Weight"},
"Color": {"parent": Dimension, "id":"300080438", "label": "Color"}, # individual colors as dimensions, 56130 is concept of color
"Length": {"parent": Dimension, "id":"300055645", "label": "Length"},
"Thickness": {"parent": Dimension, "id":"300055646", "label": "Thickness"},
"SequencePosition": {"parent": Dimension, "id":"300010269", "label":"Sequence Position"},
"PhysicalDimension": {"parent": Dimension, "id":"300055642", "label":"Unknown physical dimension"},

Expand Down Expand Up @@ -373,31 +375,39 @@ def register_instance(name, data):
"silver": {"parent": Material, "id": "300011029", "label": "silver"},
"synthetic": {"parent": Type, "id": "xxx", "label": "Synthetic Material"},

# Measurement Units
# Measurement Units - lengths, weights, durations
"lignes": {"parent": MeasurementUnit, "id": "300435501", "label": "Paris lines"},
"fr_inches": {"parent": MeasurementUnit, "id": "300435502", "label": "Paris inches"},
"fr_feet": {"parent": MeasurementUnit, "id":"300435503", "label": "Paris feet"},
"inches": {"parent": MeasurementUnit, "id": "300379100", "label": "inches"},
"feet": {"parent": MeasurementUnit, "id":"300379101", "label": "feet"},
"mm": {"parent": MeasurementUnit, "id": "300379097", "label": "millimeters"},
"cm": {"parent": MeasurementUnit, "id": "300379098", "label": "centimeters"},
"meters": {"parent": MeasurementUnit, "id": "300379099", "label": "meters"},
"braccia": {"parent": MeasurementUnit, "id": "300404161", "label": "braccia"},
"ells": {"parent": MeasurementUnit, "id": "300412070", "label": "ells"},
"grams": {"parent": MeasurementUnit, "id": "300379225", "label": "grams"},
"kilograms": {"parent": MeasurementUnit, "id": "300379226", "label": "kilograms"},
"ounces":{"parent": MeasurementUnit, "id": "300379229", "label": "ounces"},
"pounds": {"parent": MeasurementUnit, "id": "300379254", "label": "pounds"},
"seconds": {"parent": MeasurementUnit, "id": "300379239", "label": "seconds"},
"minutes": {"parent": MeasurementUnit, "id": "300379240", "label": "minutes"},
"hours": {"parent": MeasurementUnit, "id": "300379241", "label": "hours"},
"days": {"parent": MeasurementUnit, "id": "300379242", "label": "days"},
"months": {"parent": MeasurementUnit, "id": "300379245", "label": "months"},
"years": {"parent": MeasurementUnit, "id": "300379244", "label": "years"},

"percent": {"parent": MeasurementUnit, "id": "300417377", "label": "percent"},
"numbers": {"parent": MeasurementUnit, "id": "300055665", "label": "numbers"},
"bytes": {"parent": MeasurementUnit, "id": "300265869", "label": "bytes"},
"kilobytes": {"parent": MeasurementUnit, "id": "300265870", "label": "kilobytes"},
"megabytes": {"parent": MeasurementUnit, "id": "300265873", "label": "megabytes"},
"gigabytes": {"parent": MeasurementUnit, "id": "300265874", "label": "gigabytes"},
"terabytes": {"parent": MeasurementUnit, "id": "300266477", "label": "terabytes"},
"partsUnit": {"parent": MeasurementUnit, "id": "300404159", "label": "parts"},
"pageCount": {"parent": MeasurementUnit, "id": "300194222", "label": "pages"},
"pixels": {"parent": MeasurementUnit, "id": "300266190", "label": "pixels"},
"rgb_colorspace": {"parent": MeasurementUnit, "id": "300266239", "label": "rgb"},
"seconds": {"parent": MeasurementUnit, "id": "300379239", "label": "seconds"},
"days": {"parent": MeasurementUnit, "id": "300379242", "label": "days"},
"months": {"parent": MeasurementUnit, "id": "300379245", "label": "months"},
"years": {"parent": MeasurementUnit, "id": "300379244", "label": "years"},
"partsUnit": {"parent": MeasurementUnit, "id": "300404159", "label": "parts"},

# Languages
"english": {"parent": Language, "id": "300388277", "label": "English"},
Expand Down

0 comments on commit d55e207

Please sign in to comment.