
Commit f3ba9d7

First push
0 parents  commit f3ba9d7

File tree: 8 files changed, +375 -0 lines changed


.DS_Store (6 KB, binary file not shown)

README.md

+79
@@ -0,0 +1,79 @@
# BigQuery Autogen Schema

This is a tool used to convert JSON strings (inside .json files) into acceptable Python BigQuery schema lists.

It's not completely foolproof: key/value pairs whose value is a whitespace-delimited string fail to parse properly. Open to any patches (or I'll just patch it sometime).

### Example:

JSON input file:
```
cat testcase/test.json
{"elements_0_role": "ADMINISTRATOR", "elements_0_roleAssignee": "urn:li:person:AFnCO-Sd9a", "elements_0_state": "APPROVED", "elements_0_organizationalTarget": "dotmodus", "paging_count": 10, "paging_start": 0, "fake_field": {"name": "Sephiroth", "age": 99}, "final_record": 42 }
```

Output:
```
python3 bigquery_schema_gen.py testcase/test.json
schema = [
    {
        "mode": "NULLABLE",
        "name": "elements_0_role",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "elements_0_roleAssignee",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "elements_0_state",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "elements_0_organizationalTarget",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "paging_count",
        "type": "INTEGER"
    },
    {
        "mode": "NULLABLE",
        "name": "paging_start",
        "type": "INTEGER"
    },
    {
        "mode": "REPEATED",
        "name": "fake_field",
        "type": "RECORD",
        "fields": [
            {
                "mode": "NULLABLE",
                "name": "name",
                "type": "STRING"
            },
            {
                "mode": "NULLABLE",
                "name": "age",
                "type": "INTEGER"
            },
        ]
    },
    {
        "mode": "NULLABLE",
        "name": "final_record",
        "type": "INTEGER"
    },
]
```

## Important

A lot of the parsing is handled for basic fields, but the resulting Python schema must be looked over for inconsistencies.

- Brutally hacked together on a Friday afternoon by AlysonNgonyama
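Once the printed list has been checked, one possible next step is loading it into BigQuery. This is only a hedged sketch, not part of the repo: it assumes the standard `google-cloud-bigquery` client, and the table path is a placeholder.

```python
# Hypothetical follow-up, not included in this commit: turn the generated
# dict list into google-cloud-bigquery SchemaField objects.
from google.cloud import bigquery

def to_schema_fields(entries):
    """Recursively convert the generated dicts into bigquery.SchemaField objects."""
    return [
        bigquery.SchemaField(
            entry["name"],
            entry["type"],
            mode=entry.get("mode", "NULLABLE"),
            fields=to_schema_fields(entry.get("fields", [])),
        )
        for entry in entries
    ]

# schema = [...]  # the list printed by bigquery_schema_gen.py
# client = bigquery.Client()
# table = bigquery.Table("my-project.my_dataset.my_table", schema=to_schema_fields(schema))
# client.create_table(table)  # table path above is a placeholder
```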

__pycache__/unpack.cpython-36.pyc (1.76 KB, binary file not shown)

bigquery_schema_gen.py

+125
@@ -0,0 +1,125 @@
import sys
from unpack import unpack_json_file, start_index, end_index


def query_level(level):
    # Print `level` indentation spaces without a trailing newline.
    x = 0
    while x < level:
        print(' ', end='')
        x += 1


def small_indent(level):
    # Currently unused helper that also prints `level` spaces.
    x = 0
    while x < level:
        print(' ', end='')
        x += 1


def display_field(mode, name, type_name, indent_level):
    # Print the opening of a schema entry: mode, name and type.
    mode = mode + ','
    name = name + ','
    if type_name == '"RECORD",':
        # RECORD entries are always emitted with mode REPEATED.
        mode = '"REPEATED",'
    query_level(indent_level + 2)
    print('{')
    query_level(indent_level + 3)
    print('"mode":', mode)
    query_level(indent_level + 3)
    print('"name":', name)
    query_level(indent_level + 3)
    print('"type":', type_name)


def repeated_loop(val, indent_level, key):
    # Split a JSON array of objects into its items and emit each one.
    val = val[start_index(val, '[') + 2:end_index(val, ']') - 1]
    repeated_items = val.split('} {')
    for item in repeated_items:
        item = '{' + item + '}'
        bigquery_schema_gen(item.replace(' ', ',').replace(':,', ':'), indent_level + 1)


def convert_to_schema_block(key, val, indent_level):
    # Decide the BigQuery type of `val` and print the matching schema block.
    schema_value_type = ''
    val = val.strip()
    if len(val) == 0:
        return
    if val.isdigit():
        schema_value_type = '"INTEGER"'
    elif val[0] == '[':
        # Array of objects: emit a REPEATED RECORD with nested fields.
        schema_value_type = '"RECORD",'
        display_field('"NULL"', key.strip(), schema_value_type, indent_level)
        query_level(indent_level + 3)
        print('"fields": [')
        repeated_loop(val, indent_level + 1, key.strip())
        query_level(indent_level + 3)
        print(']')
        query_level(indent_level + 2)
        print('},')
        return
    elif val[0] == '{':
        # Nested object: also emitted as a (REPEATED) RECORD; recurse into its fields.
        schema_value_type = '"RECORD",'
        display_field('"NULLABLE"', key.strip(), schema_value_type, indent_level)
        query_level(indent_level + 3)
        print('"fields": [')
        bigquery_schema_gen(val.strip(), indent_level + 2)
        query_level(indent_level + 3)
        print(']')
        query_level(indent_level + 2)
        print('},')
        return
    elif val[0] == '"':
        schema_value_type = '"STRING"'
    elif '.' in val:
        schema_value_type = '"FLOAT"'
    display_field('"NULLABLE"', key.strip(), schema_value_type, indent_level)
    query_level(indent_level + 2)
    print('},')


def cleanly_separate_key_values(line):
    """Find the delimiter that separates key from value.

    Splitting with .split(':') is often inaccurate because some values
    contain ':' themselves, so only the first ':' is treated as the
    key/value separator.
    """
    index = line.find(':')
    key = line[:index]
    value = line[index + 1:]
    return key, value


def bigquery_schema_gen(output, indent_level):
    """Automate the generation of a BigQuery schema from a flattened JSON file."""
    new_content = output
    if '[' in output:
        new_content = unpack_json_file(output, '[', ']')
    if '{' in output:
        new_content = unpack_json_file(output, '{', '}')
    for values in new_content.split(','):
        key = values[:values.find(':')]
        val = values[values.find(':') + 1:]
        # Restore the separators that unpack_json_file masked.
        val = val.replace('***', ',').replace('^', ' ')
        convert_to_schema_block(key, val, indent_level)


if len(sys.argv) != 2:
    print('Usage : %s [json file]' % sys.argv[0])
    sys.exit(1)
if not sys.argv[1].endswith('.json'):
    print('Error : file must be a .json file')
    sys.exit(1)

print('schema = [')
bigquery_schema_gen(open(sys.argv[1]).read(), 1)
print(']')
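The script above infers types by splitting raw strings, which is where the whitespace limitation noted in the README comes from. For comparison only, here is a minimal sketch of the same type-inference idea built on the standard-library `json` module; it is not part of this commit, and the handling of lists assumes they contain objects.

```python
# Illustrative alternative, not part of this commit: infer BigQuery field
# types from parsed JSON values instead of raw string splitting.
import json

def infer_schema(obj):
    fields = []
    for name, value in obj.items():
        if isinstance(value, bool):
            fields.append({"mode": "NULLABLE", "name": name, "type": "BOOLEAN"})
        elif isinstance(value, int):
            fields.append({"mode": "NULLABLE", "name": name, "type": "INTEGER"})
        elif isinstance(value, float):
            fields.append({"mode": "NULLABLE", "name": name, "type": "FLOAT"})
        elif isinstance(value, dict):
            fields.append({"mode": "NULLABLE", "name": name, "type": "RECORD",
                           "fields": infer_schema(value)})
        elif isinstance(value, list):
            # Assumes a list of objects; the first element is taken as representative.
            fields.append({"mode": "REPEATED", "name": name, "type": "RECORD",
                           "fields": infer_schema(value[0]) if value else []})
        else:
            fields.append({"mode": "NULLABLE", "name": name, "type": "STRING"})
    return fields

# Example: infer_schema(json.loads(open("testcase/test.json").read()))
```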

testcase/test.json

+1
@@ -0,0 +1 @@
{"elements_0_role": "ADMINISTRATOR", "elements_0_roleAssignee": "urn:li:person:AFnCO-Sd9a", "elements_0_state": "APPROVED", "elements_0_organizationalTarget": "dotmodus", "paging_count": 10, "paging_start": 0, "fake_field": {"name": "Sephiroth", "age": 99}, "final_record": 42 }

testcase/test2.json

+2
@@ -0,0 +1,2 @@
{"id":"1","first_name":"John","last_name":"Doe","dob":"1968-01-22","addresses":[{"status":"current","address":"123FirstAvenue","city":"Seattle","state":"WA","zip":"11111","numberOfYears":"1"},{"status":"previous","address":"456MainStreet","city":"Portland","state":"OR","zip":"22222","numberOfYears":"5"}]}

testcase/testfile

+84
@@ -0,0 +1,84 @@
{
    "mode": "NULLABLE"
    "name": "elements"
    "type": "REPEATED"
    {
        "mode": "NULLABLE"
        "name": "totalShareStatistics"
        "type": "RECORD"
        "fields": [
            {
                "mode": "NULLABLE"
                "name": "shareCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "uniqueImpressionsCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "clickCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "engagement"
                "type": "FLOAT"
            }
            {
                "mode": "NULLABLE"
                "name": "shareMentionsCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "likeCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "impressionCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "commentMentionsCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "commentCount"
                "type": "INTEGER"
            }
        ]
    }
    {
        "mode": "NULLABLE"
        "name": "organizationalEntity"
        "type": "STRING"
    }
}
{
    "mode": "NULLABLE"
    "name": "paging"
    "type": "RECORD"
    "fields": [
        {
            "mode": "NULLABLE"
            "name": "count"
            "type": "INTEGER"
        }
        {
            "mode": "NULLABLE"
            "name": "start"
            "type": "INTEGER"
        }
        {
            "mode": "NULLABLE"
            "name": "links"
            "type": "REPEATED"
        }
    ]
}

unpack.py

+84
@@ -0,0 +1,84 @@
def start_index(content, b):
    # Index of the first occurrence of character b (len(content) if absent).
    start = 0
    while start < len(content):
        if content[start] == b:
            break
        start += 1
    return start


def end_index(content, b):
    # Index of the last occurrence of character b (0 if absent).
    end = len(content) - 1
    while end > 0:
        if content[end] == b:
            break
        end -= 1
    return end


def unpack_json_block(content):
    # Variant that only masks inside [...] blocks and prints the recovered
    # key/value pairs instead of returning the masked string.
    new_content = ''
    count = 0
    char = 0
    start = start_index(content, '{')
    end = end_index(content, '}')
    content = content[start + 1:end]
    while char < len(content):
        if content[char] == '[':
            count += 1
        elif content[char] == ']':
            count -= 1
        if count > 0 and (content[char] == ' ' or content[char] == ','):
            new_content = new_content + '^'
        else:
            new_content = new_content + content[char]
        char += 1

    for values in new_content.split(','):
        key = values[:values.find(':')]
        val = values[values.find(':'):]
        val = val.replace('^^', ' ').replace('^', ',')
        print('key ', key, ' val ', val)


def unpack_json_file(content, op, cls):
    # Strip the outermost braces, then mask separators that sit inside nested
    # op/cls blocks: spaces become '^' and commas become '***'. The caller can
    # then split the result on ',' without breaking up nested blocks.
    new_content = ''
    count = 0
    char = 0
    start = start_index(content, '{')
    end = end_index(content, '}')
    content = content[start + 1:end]
    while char < len(content):
        if content[char] == op:
            count += 1
        elif content[char] == cls:
            count -= 1
        if count > 0 and (content[char] == ' ' or content[char] == ','):
            if content[char] == ' ':
                new_content = new_content + '^'
            else:
                new_content = new_content + '***'
        else:
            new_content = new_content + content[char]
        char += 1

    return new_content


def unittest(new_content):
    # Debug helper: print the key/value pairs recovered from a masked string.
    for values in new_content.split(','):
        key = values[:values.find(':')]
        val = values[values.find(':'):]
        val = val.replace('^^', ' ').replace('^', ',')
        print('key ', key, ' val ', val)


# import sys
# unpack_json_object(open(sys.argv[1]).read(), '[', ']')
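For clarity, here is a small illustration (not in the commit) of the masking scheme `unpack_json_file` applies, assuming it is imported from `unpack`:

```python
# Illustration only: unpack_json_file masks separators inside nested blocks
# so the caller's top-level split(',') leaves those blocks intact.
from unpack import unpack_json_file

s = '{"a": 1, "b": {"c": 2, "d": 3}}'
print(unpack_json_file(s, '{', '}'))
# Should print: "a": 1, "b": {"c":^2***^"d":^3}
# Inside the nested block, spaces become '^' and commas become '***';
# bigquery_schema_gen later reverses this with .replace('***', ',').replace('^', ' ').
```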
