
Commit f3ba9d7

First push
0 parents  commit f3ba9d7

File tree: 8 files changed, +375 -0 lines changed


.DS_Store (6 KB, binary file not shown)

README.md

+79
@@ -0,0 +1,79 @@
# BigQuery Autogen Schema

This is a tool used to convert JSON strings (inside .json files) into acceptable Python BigQuery schema lists.

It's not completely foolproof: key/value pairs whose value is a whitespace-delimited string fail to parse properly. Open to any patches (or I'll just patch it sometime).

### Example:

JSON input file:
```
cat testcase/test.json
{"elements_0_role": "ADMINISTRATOR", "elements_0_roleAssignee": "urn:li:person:AFnCO-Sd9a", "elements_0_state": "APPROVED", "elements_0_organizationalTarget": "dotmodus", "paging_count": 10, "paging_start": 0, "fake_field": {"name": "Sephiroth", "age": 99}, "final_record": 42 }
```

Output:
```
python3 bigquery_schema_gen.py testcase/test.json
schema = [
    {
        "mode": "NULLABLE",
        "name": "elements_0_role",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "elements_0_roleAssignee",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "elements_0_state",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "elements_0_organizationalTarget",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "paging_count",
        "type": "INTEGER"
    },
    {
        "mode": "NULLABLE",
        "name": "paging_start",
        "type": "INTEGER"
    },
    {
        "mode": "REPEATED",
        "name": "fake_field",
        "type": "RECORD",
        "fields": [
            {
                "mode": "NULLABLE",
                "name": "name",
                "type": "STRING"
            },
            {
                "mode": "NULLABLE",
                "name": "age",
                "type": "INTEGER"
            },
        ]
    },
    {
        "mode": "NULLABLE",
        "name": "final_record",
        "type": "INTEGER"
    },
]
```

## Important

A lot of the parsing is handled for basic fields, but the resulting Python schema must be looked over for inconsistencies.

- Brutally hacked together on a Friday afternoon by AlysonNgonyama
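Once the printed list has been checked, one possible next step is loading it into BigQuery. This is only a hedged sketch, not part of the repo: it assumes the standard `google-cloud-bigquery` client, and the table path is a placeholder.

```python
# Hypothetical follow-up, not included in this commit: turn the generated
# dict list into google-cloud-bigquery SchemaField objects.
from google.cloud import bigquery

def to_schema_fields(entries):
    """Recursively convert the generated dicts into bigquery.SchemaField objects."""
    return [
        bigquery.SchemaField(
            entry["name"],
            entry["type"],
            mode=entry.get("mode", "NULLABLE"),
            fields=to_schema_fields(entry.get("fields", [])),
        )
        for entry in entries
    ]

# schema = [...]  # the list printed by bigquery_schema_gen.py
# client = bigquery.Client()
# table = bigquery.Table("my-project.my_dataset.my_table", schema=to_schema_fields(schema))
# client.create_table(table)  # table path above is a placeholder
```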

__pycache__/unpack.cpython-36.pyc (1.76 KB, binary file not shown)

bigquery_schema_gen.py

+125
@@ -0,0 +1,125 @@
import sys
from unpack import unpack_json_file, start_index, end_index


def query_level(level):
    # Print `level` indentation spaces without a trailing newline.
    x = 0
    while x < level:
        print(' ', end='')
        x += 1


def small_indent(level):
    # Currently unused helper that also prints `level` spaces.
    x = 0
    while x < level:
        print(' ', end='')
        x += 1


def display_field(mode, name, type_name, indent_level):
    # Print the opening of a schema entry: mode, name and type.
    mode = mode + ','
    name = name + ','
    if type_name == '"RECORD",':
        # RECORD entries are always emitted with mode REPEATED.
        mode = '"REPEATED",'
    query_level(indent_level + 2)
    print('{')
    query_level(indent_level + 3)
    print('"mode":', mode)
    query_level(indent_level + 3)
    print('"name":', name)
    query_level(indent_level + 3)
    print('"type":', type_name)


def repeated_loop(val, indent_level, key):
    # Split a JSON array of objects into its items and emit each one.
    val = val[start_index(val, '[') + 2:end_index(val, ']') - 1]
    repeated_items = val.split('} {')
    for item in repeated_items:
        item = '{' + item + '}'
        bigquery_schema_gen(item.replace(' ', ',').replace(':,', ':'), indent_level + 1)


def convert_to_schema_block(key, val, indent_level):
    # Decide the BigQuery type of `val` and print the matching schema block.
    schema_value_type = ''
    val = val.strip()
    if len(val) == 0:
        return
    if val.isdigit():
        schema_value_type = '"INTEGER"'
    elif val[0] == '[':
        # Array of objects: emit a REPEATED RECORD with nested fields.
        schema_value_type = '"RECORD",'
        display_field('"NULL"', key.strip(), schema_value_type, indent_level)
        query_level(indent_level + 3)
        print('"fields": [')
        repeated_loop(val, indent_level + 1, key.strip())
        query_level(indent_level + 3)
        print(']')
        query_level(indent_level + 2)
        print('},')
        return
    elif val[0] == '{':
        # Nested object: also emitted as a (REPEATED) RECORD; recurse into its fields.
        schema_value_type = '"RECORD",'
        display_field('"NULLABLE"', key.strip(), schema_value_type, indent_level)
        query_level(indent_level + 3)
        print('"fields": [')
        bigquery_schema_gen(val.strip(), indent_level + 2)
        query_level(indent_level + 3)
        print(']')
        query_level(indent_level + 2)
        print('},')
        return
    elif val[0] == '"':
        schema_value_type = '"STRING"'
    elif '.' in val:
        schema_value_type = '"FLOAT"'
    display_field('"NULLABLE"', key.strip(), schema_value_type, indent_level)
    query_level(indent_level + 2)
    print('},')


def cleanly_separate_key_values(line):
    """Find the delimiter that separates key from value.

    Splitting with .split(':') is often inaccurate because some values
    contain ':' themselves, so only the first ':' is treated as the
    key/value separator.
    """
    index = line.find(':')
    key = line[:index]
    value = line[index + 1:]
    return key, value


def bigquery_schema_gen(output, indent_level):
    """Automate the generation of a BigQuery schema from a flattened JSON file."""
    new_content = output
    if '[' in output:
        new_content = unpack_json_file(output, '[', ']')
    if '{' in output:
        new_content = unpack_json_file(output, '{', '}')
    for values in new_content.split(','):
        key = values[:values.find(':')]
        val = values[values.find(':') + 1:]
        # Restore the separators that unpack_json_file masked.
        val = val.replace('***', ',').replace('^', ' ')
        convert_to_schema_block(key, val, indent_level)


if len(sys.argv) != 2:
    print('Usage : %s [json file]' % sys.argv[0])
    sys.exit(1)
if not sys.argv[1].endswith('.json'):
    print('Error : file must be a .json file')
    sys.exit(1)

print('schema = [')
bigquery_schema_gen(open(sys.argv[1]).read(), 1)
print(']')
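The script above infers types by splitting raw strings, which is where the whitespace limitation noted in the README comes from. For comparison only, here is a minimal sketch of the same type-inference idea built on the standard-library `json` module; it is not part of this commit, and the handling of lists assumes they contain objects.

```python
# Illustrative alternative, not part of this commit: infer BigQuery field
# types from parsed JSON values instead of raw string splitting.
import json

def infer_schema(obj):
    fields = []
    for name, value in obj.items():
        if isinstance(value, bool):
            fields.append({"mode": "NULLABLE", "name": name, "type": "BOOLEAN"})
        elif isinstance(value, int):
            fields.append({"mode": "NULLABLE", "name": name, "type": "INTEGER"})
        elif isinstance(value, float):
            fields.append({"mode": "NULLABLE", "name": name, "type": "FLOAT"})
        elif isinstance(value, dict):
            fields.append({"mode": "NULLABLE", "name": name, "type": "RECORD",
                           "fields": infer_schema(value)})
        elif isinstance(value, list):
            # Assumes a list of objects; the first element is taken as representative.
            fields.append({"mode": "REPEATED", "name": name, "type": "RECORD",
                           "fields": infer_schema(value[0]) if value else []})
        else:
            fields.append({"mode": "NULLABLE", "name": name, "type": "STRING"})
    return fields

# Example: infer_schema(json.loads(open("testcase/test.json").read()))
```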

testcase/test.json

+1
@@ -0,0 +1 @@
{"elements_0_role": "ADMINISTRATOR", "elements_0_roleAssignee": "urn:li:person:AFnCO-Sd9a", "elements_0_state": "APPROVED", "elements_0_organizationalTarget": "dotmodus", "paging_count": 10, "paging_start": 0, "fake_field": {"name": "Sephiroth", "age": 99}, "final_record": 42 }

testcase/test2.json

+2
@@ -0,0 +1,2 @@
{"id":"1","first_name":"John","last_name":"Doe","dob":"1968-01-22","addresses":[{"status":"current","address":"123FirstAvenue","city":"Seattle","state":"WA","zip":"11111","numberOfYears":"1"},{"status":"previous","address":"456MainStreet","city":"Portland","state":"OR","zip":"22222","numberOfYears":"5"}]}

testcase/testfile

+84
@@ -0,0 +1,84 @@
{
    "mode": "NULLABLE"
    "name": "elements"
    "type": "REPEATED"
    {
        "mode": "NULLABLE"
        "name": "totalShareStatistics"
        "type": "RECORD"
        "fields": [
            {
                "mode": "NULLABLE"
                "name": "shareCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "uniqueImpressionsCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "clickCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "engagement"
                "type": "FLOAT"
            }
            {
                "mode": "NULLABLE"
                "name": "shareMentionsCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "likeCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "impressionCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "commentMentionsCount"
                "type": "INTEGER"
            }
            {
                "mode": "NULLABLE"
                "name": "commentCount"
                "type": "INTEGER"
            }
        ]
    }
    {
        "mode": "NULLABLE"
        "name": "organizationalEntity"
        "type": "STRING"
    }
}
{
    "mode": "NULLABLE"
    "name": "paging"
    "type": "RECORD"
    "fields": [
        {
            "mode": "NULLABLE"
            "name": "count"
            "type": "INTEGER"
        }
        {
            "mode": "NULLABLE"
            "name": "start"
            "type": "INTEGER"
        }
        {
            "mode": "NULLABLE"
            "name": "links"
            "type": "REPEATED"
        }
    ]
}

unpack.py

+84
@@ -0,0 +1,84 @@
def start_index(content, b):
    # Index of the first occurrence of character b (len(content) if absent).
    start = 0
    while start < len(content):
        if content[start] == b:
            break
        start += 1
    return start


def end_index(content, b):
    # Index of the last occurrence of character b (0 if absent).
    end = len(content) - 1
    while end > 0:
        if content[end] == b:
            break
        end -= 1
    return end


def unpack_json_block(content):
    # Variant that only masks inside [...] blocks and prints the recovered
    # key/value pairs instead of returning the masked string.
    new_content = ''
    count = 0
    char = 0
    start = start_index(content, '{')
    end = end_index(content, '}')
    content = content[start + 1:end]
    while char < len(content):
        if content[char] == '[':
            count += 1
        elif content[char] == ']':
            count -= 1
        if count > 0 and (content[char] == ' ' or content[char] == ','):
            new_content = new_content + '^'
        else:
            new_content = new_content + content[char]
        char += 1

    for values in new_content.split(','):
        key = values[:values.find(':')]
        val = values[values.find(':'):]
        val = val.replace('^^', ' ').replace('^', ',')
        print('key ', key, ' val ', val)


def unpack_json_file(content, op, cls):
    # Strip the outermost braces, then mask separators that sit inside nested
    # op/cls blocks: spaces become '^' and commas become '***'. The caller can
    # then split the result on ',' without breaking up nested blocks.
    new_content = ''
    count = 0
    char = 0
    start = start_index(content, '{')
    end = end_index(content, '}')
    content = content[start + 1:end]
    while char < len(content):
        if content[char] == op:
            count += 1
        elif content[char] == cls:
            count -= 1
        if count > 0 and (content[char] == ' ' or content[char] == ','):
            if content[char] == ' ':
                new_content = new_content + '^'
            else:
                new_content = new_content + '***'
        else:
            new_content = new_content + content[char]
        char += 1

    return new_content


def unittest(new_content):
    # Debug helper: print the key/value pairs recovered from a masked string.
    for values in new_content.split(','):
        key = values[:values.find(':')]
        val = values[values.find(':'):]
        val = val.replace('^^', ' ').replace('^', ',')
        print('key ', key, ' val ', val)


# import sys
# unpack_json_object(open(sys.argv[1]).read(), '[', ']')
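For clarity, here is a small illustration (not in the commit) of the masking scheme `unpack_json_file` applies, assuming it is imported from `unpack`:

```python
# Illustration only: unpack_json_file masks separators inside nested blocks
# so the caller's top-level split(',') leaves those blocks intact.
from unpack import unpack_json_file

s = '{"a": 1, "b": {"c": 2, "d": 3}}'
print(unpack_json_file(s, '{', '}'))
# Should print: "a": 1, "b": {"c":^2***^"d":^3}
# Inside the nested block, spaces become '^' and commas become '***';
# bigquery_schema_gen later reverses this with .replace('***', ',').replace('^', ' ').
```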
