-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathProcess SGCN repository source files.py
171 lines (126 loc) · 8.02 KB
/
Process SGCN repository source files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# coding: utf-8
# This notebook provides code that processes the [SGCN/SWAP repository in ScienceBase](https://www.sciencebase.gov/catalog/item/56d720ece4b015c306f442d5) into the SGCN database in the GC2 instance we are experimenting with. It works through all of the items in the repo, grabs the source data file, checks the source data file record count against the current database, wipes the current database for that state/year if there's a problem, inserts all records from the source file, and verifies that the new record count matches.
#
# I use jupyter nbconvert to output this notebook to a python script and run it in its own environment to fully process the repository.
# In[16]:
import requests,io,configparser
from IPython.display import display
import pandas as pd
# In[17]:
# Get API keys and any other config details from a file that is external to the code.
config = configparser.RawConfigParser()
config.read_file(open(r'../config/stuff.py'))
# In[18]:
# Build base URL with API key using input from the external config.
def getBaseURL():
gc2APIKey = config.get('apiKeys','apiKey_GC2_BCB').replace('"','')
apiBaseURL = "https://gc2.mapcentia.com/api/v1/sql/bcb?key="+gc2APIKey
return apiBaseURL
# In[19]:
def getCurrentRecordCount(sgcn_state,sgcn_year):
r = requests.get(getBaseURL()+"&q=SELECT COUNT(*) AS sumstateyear FROM sgcn.sgcn WHERE sgcn_year="+str(sgcn_year)+" AND sgcn_state='"+sgcn_state+"'").json()
return r["features"][0]["properties"]["sumstateyear"]
# In[20]:
def clearStateYear(sgcn_state,sgcn_year):
return requests.get(getBaseURL()+"&q=DELETE FROM sgcn.sgcn WHERE sgcn_year="+str(sgcn_year)+" AND sgcn_state='"+sgcn_state+"'").json()
# In[21]:
def insertSGCNData(record):
q = "INSERT INTO sgcn.sgcn (sourceid,sourcefilename,sourcefileurl,sgcn_state,sgcn_year,scientificname_submitted,commonname_submitted,taxonomicgroup_submitted,firstyear) VALUES ('"+record["sourceid"]+"','"+record["sourcefilename"]+"','"+record["sourcefileurl"]+"','"+record["sgcn_state"]+"',"+str(record["sgcn_year"])+",'"+record["scientificname_submitted"]+"','"+record["commonname_submitted"]+"','"+record["taxonomicgroup_submitted"]+"',"+str(record["firstyear"])+")"
return requests.get(getBaseURL()+"&q="+q).json()
# In[25]:
def stringCleaning(string):
# These are things found in the name strings that cause problems sending the data over the GC2 API to the database
# It might be faster to do this with regex
string = string.replace("'","''")
string = string.replace("--","")
string = string.replace("&","and")
string = string.replace('"',"''")
string = string.replace(";",",")
string = string.replace("#","no.")
return string
# In[26]:
# Query ScienceBase for all SGCN source items
sbQ = "https://www.sciencebase.gov/catalog/items?parentId=56d720ece4b015c306f442d5&format=json&fields=files,tags,dates&max=100"
sbR = requests.get(sbQ).json()
totalRecordsInFiles = 0
# Loop through the repository items and sync data to SGCN database
for item in sbR["items"]:
# Create an iterative structure to contain processed attributes for a given state and year
thisItem = {}
# Set the source id as the ScienceBase Item URI/URL to reference from all final records
thisItem["sourceid"] = item["link"]["url"]
# Extract the state name from the place tag
# NOTE: This is brittle and needs more work. An item could have many tags, and I just shortcutted to this point based on how Abby has managed these items so far.
try:
thisItem["sgcn_state"] = item['tags'][0]['name']
except:
display (thisItem)
break
# Extract the year for this item from the dates collection based on the "Collected" date type
# Break after we find the date collected that we want to use
for date in item["dates"]:
if date["type"] == "Collected":
thisItem["sgcn_year"] = date["dateString"]
break
# Retrieve the current record count in the SGCN database for the state and year
thisItem["startingrecordcount"] = getCurrentRecordCount(thisItem["sgcn_state"],thisItem["sgcn_year"])
# Extract the file we need to process from the files structure
# Break after we find that file - assumes there is only one original file to process
# Still need to deal with the update files
for file in item["files"]:
if file["title"] == "Process ready version of original file":
thisItem["sourcefilename"] = file["name"]
thisItem["sourcefileurl"] = file["url"]
break
# Retrieve the file into a dataframe for processing
# The read_table method with explicit tab separator seems to be pretty reliable and robust directly from ScienceBase file URLs, but this may have to be reexamined in future if it fails
stateData = pd.read_table(thisItem["sourcefileurl"],sep='\t')
totalRecordsThisFile = len(stateData.index)
totalRecordsInFiles = totalRecordsInFiles + totalRecordsThisFile
# Check the total columns in the source data against the starting record count from the database
if thisItem["startingrecordcount"] != totalRecordsThisFile:
# If the number of source records does not match the current database, clear out the database for reprocessing
print (clearStateYear(thisItem["sgcn_state"],thisItem["sgcn_year"]))
print ("Cleared data for: "+thisItem["sgcn_state"]+" - "+str(thisItem["sgcn_year"]))
# Store the source record count for reference
thisItem["sourcerecordcount"] = len(stateData.index)
# Set column names to lower case to deal with pesky human problems
stateData.columns = map(str.lower, stateData.columns)
thisRecord = {}
# Loop the dataframe, fix text values, and load to SGCN database
# NaN values from the dataframe are nulls, show up as float type, and need to get set to a blank string
for index, row in stateData.iterrows():
if type(row['scientific name']) is float:
thisRecord["scientificname_submitted"] = ""
else:
thisRecord["scientificname_submitted"] = stringCleaning(row['scientific name'])
if type(row['common name']) is float:
thisRecord["commonname_submitted"] = ""
else:
thisRecord["commonname_submitted"] = stringCleaning(row['common name'])
thisRecord["taxonomicgroup_submitted"] = ""
if 'taxonomy group' in stateData.columns:
thisRecord["taxonomicgroup_submitted"] = row['taxonomy group']
elif 'taxonomic group' in stateData.columns:
thisRecord["taxonomicgroup_submitted"] = row['taxonomic group']
thisRecord["firstyear"] = False
if '2005 swap' in stateData.columns:
if row['2005 swap'] in ["N","n","No","no"]:
thisRecord["firstyear"] = True
# Add in repository item metadata
thisRecord["sourceid"] = thisItem["sourceid"]
thisRecord["sourcefilename"] = thisItem["sourcefilename"]
thisRecord["sourcefileurl"] = thisItem["sourcefileurl"]
thisRecord["sgcn_state"] = thisItem["sgcn_state"]
thisRecord["sgcn_year"] = thisItem["sgcn_year"]
# Insert the record
print(insertSGCNData(thisRecord))
# Check total record count after inserting new data to make sure the numbers line up
if thisItem["sourcerecordcount"] != getCurrentRecordCount(thisItem["sgcn_state"],thisItem["sgcn_year"]):
print ("Something went wrong with "+thisItem["sgcn_state"],thisItem["sgcn_year"],thisItem["sourceid"])
else:
print (thisItem["sgcn_state"],thisItem["sgcn_year"])
print ("Source record count: "+str(thisItem["sourcerecordcount"]))
print ("Database record count: "+str(getCurrentRecordCount(thisItem["sgcn_state"],thisItem["sgcn_year"])))
else:
print ("Record Numbers Matched: "+thisItem["sgcn_state"]+" - "+str(thisItem["sgcn_year"]))