Skip to content

Commit 0241991

Browse files
committed
uploading workload execution files
0 parents  commit 0241991

30 files changed

+249145
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# canary

datamining/divindex.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import subprocess
2+
import time
3+
4+
def main(args):
    """Run the US diversity-index job and report how long it took.

    Args:
        args: Invocation arguments; not read by this function.

    Returns:
        A dict with a single "Status" entry. It is returned unconditionally:
        the outcome of the Java process is not inspected here.
    """
    started_at = time.time()
    __post("java -jar /USCensus.jar US_diversity_index")
    elapsed = time.time() - started_at
    print("INFO ::> Job Completed @ {0} seconds".format(elapsed))
    return {"Status": "Job completed"}
10+
11+
12+
def __post(command):
    """Run ``command`` through bash and print everything it wrote.

    stderr is folded into stdout, so the printed text contains both streams.
    The exit status is not checked and nothing is returned.

    Args:
        command: Shell command line handed to ``/bin/bash -c``.
    """
    completed = subprocess.run(
        ['/bin/bash', '-c', command],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    output_text = completed.stdout.decode().strip()
    print('Result:', output_text)
16+

datamining/usdivindex.log

Lines changed: 247192 additions & 0 deletions
Large diffs are not rendered by default.

dbquery/dbquery.log

Lines changed: 802 additions & 0 deletions
Large diffs are not rendered by default.

dbquery/prepare.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import sys
2+
import psycopg2
3+
import json
4+
from psycopg2.extras import Json
5+
# Unused helper: nothing in this script calls it.
6+
# Walks a (possibly nested) JSON-like dict and yields its leaf keys.
7+
def id_generator(dict_var):
    """Yield the leaf keys of a nested, JSON-like dictionary.

    String values are first normalised towards JSON (``True``/``False``
    lowered, every quote character turned into a double quote) and parsed.
    A value that is a dict, either originally or after parsing, is descended
    into; for anything else the key itself is yielded.

    Args:
        dict_var: Mapping whose values may be scalars, dicts, or strings
            holding a Python-style dict literal.

    Yields:
        Keys whose (possibly parsed) value is not a dict, depth first in
        iteration order.
    """
    for k, v in dict_var.items():
        # Only strings can hold embedded JSON. Non-string values are left
        # untouched so that real dicts are still recursed into below.
        if isinstance(v, str):
            candidate = v.replace("True", "true").replace("False", "false").replace('"', "'").replace("'", '"')
            try:
                v = json.loads(candidate)
                print(f'Parsed: {k}: {v}')
            except ValueError:
                # json.JSONDecodeError is a ValueError; a plain string that is
                # not JSON is simply treated as a leaf value.
                print(f'Cannot parse {k}: {v}')
        if isinstance(v, dict):
            yield from id_generator(v)
        else:
            yield k
22+
# Multi-statement script sent to the server in one execute() call. It
# (1) flattens nested attribute objects into dotted keys and stores one JSON
# object per business in business_attribute_temp, (2) defines
# create_jsonb_flat_view(), which builds a view exposing every distinct JSON
# key as a text column, and (3) materialises that view as
# business_attributes_exploded with business_id as primary key.
_EXPLODE_ATTRIBUTES_SQL = """
CREATE TABLE business_attribute_temp AS (with recursive flat(business_id, key, value) as
(
SELECT business_id, key, value FROM business_attribute, jsonb_each(attribute)
UNION
SELECT f.business_id, concat(f.key, '.', j.key), j.value FROM flat f, jsonb_each(f.value) j WHERE jsonb_typeof(f.value) = 'object'
)
SELECT business_id, jsonb_object_agg(key, value) as data from flat WHERE jsonb_typeof(value) <> 'object' GROUP BY business_id);
create or replace function create_jsonb_flat_view
(table_name text, regular_columns text, json_column text)
returns text language plpgsql as $$
declare
cols text;
begin
execute format ($ex$
select string_agg(format('%2$s->>%%1$L "%%1$s"', key), ', ')
from (
select distinct key
from %1$s, jsonb_each(%2$s)
order by 1
) s;
$ex$, table_name, json_column)
into cols;
execute format($ex$
drop view if exists %1$s_view;
create view %1$s_view as
select %2$s, %3$s from %1$s
$ex$, table_name, regular_columns, cols);
return cols;
end $$;
SELECT create_jsonb_flat_view('business_attribute_temp', 'business_id', 'data');
CREATE TABLE business_attributes_exploded AS (SELECT * FROM business_attribute_temp_view);
ALTER TABLE business_attributes_exploded ADD PRIMARY KEY (business_id);
"""


def process_business_attributes():
    """Repair stringified JSON in business_attribute and build the exploded table.

    Steps:
      1. Read every row of ``business_attribute``.
      2. For each attribute value that is a string containing both ``{`` and
         ``}``, normalise it towards JSON and parse it; rows where at least
         one value parsed are written back.
      3. Run the flattening script that creates ``business_attribute_temp``,
         the ``create_jsonb_flat_view`` function, its view and
         ``business_attributes_exploded``.

    All work is committed once at the end; the connection is closed whether
    or not the job succeeds, so a failure leaves nothing committed.
    """
    conn = psycopg2.connect(host='172.17.0.6', user='postgres', dbname='canary_db')
    try:
        cur = conn.cursor()
        cur.execute("SELECT * FROM business_attribute")
        rows = cur.fetchall()

        ops = []
        for r in rows:
            # r[0] is used as the business id and r[1] as the attribute
            # mapping, as the UPDATE below shows.
            attributes = r[1]
            if not attributes:
                # Presumably a NULL or empty attribute column; nothing to repair.
                continue
            is_updated = False
            for k, v in attributes.items():
                if not (isinstance(v, str) and '{' in v and '}' in v):
                    continue
                # Lower True/False to the JSON spellings. The two quote
                # replacements together turn every quote character, single or
                # double, into a double quote, which JSON requires around keys
                # and strings.
                candidate = v.replace("True", "true").replace("False", "false").replace('"', "'").replace("'", '"')
                try:
                    parsed = json.loads(candidate)
                except ValueError:
                    # Not valid JSON even after normalisation: keep the
                    # original string untouched (best effort).
                    continue
                # Re-assigning an existing key does not resize the dict, so it
                # is safe while iterating over items().
                attributes[k] = parsed
                is_updated = True
            if is_updated:
                ops.append({"business_id": r[0], "attributes": Json(attributes)})

        print(f'Starting to update {len(ops)} rows in business_attribute table...')
        # Original author's note: the following query takes around 50 minutes.
        cur.executemany("UPDATE business_attribute SET attribute = %(attributes)s WHERE business_id = %(business_id)s", ops)
        print(f'Updated {len(ops)} rows in business_attributes')

        # Original author's note: this query takes around 13 seconds.
        # No parameters are passed, so the '%' sequences inside the script
        # reach the server verbatim.
        cur.execute(_EXPLODE_ATTRIBUTES_SQL)
        cur.close()
        conn.commit()
    finally:
        # Closing without a commit discards the open transaction, so a failed
        # run does not leave a half-applied update behind.
        conn.close()
87+
88+
# Module-level entry point: the whole preparation job runs as soon as this
# file is executed (there is no __main__ guard, so importing it runs it too).
process_business_attributes()

dbquery/querydb.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/python3
2+
#!/usr/bin/env python3
3+
import psycopg2
4+
import subprocess, time
5+
from psycopg2.extras import Json
6+
7+
def main(args):
    """Run the six benchmark queries 100 times and log elapsed time after each.

    Every log line reports seconds elapsed since the function started.
    Query results are never fetched; only execution time is of interest.

    Args:
        args: Invocation arguments; not read by this function.

    Returns:
        A dict with a single 'CANARY STATUS' entry once all iterations ran.
    """
    initTime = time.time()

    # NOTE(review): the output of this lookup is never used. The connection
    # below uses a fixed address, and the container inspected is
    # 'canary-mongodb' although the database is PostgreSQL. The call is kept
    # because it contributes to the measured time; confirm whether it can go.
    subprocess.run(['/bin/bash', '-c', "docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' canary-mongodb"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)

    # Executed in this order on every iteration.
    queries = (
        "select t1.business_id, t1.name, t1.city, t1.state, t1.stars, t1.review_count, count(t2.id) as checkins, sum(t3.cool) as cool_reviews from business_info t1 JOIN checkin_info t2 on t1.business_id=t2.business_id JOIN review_info t3 on t2.business_id=t3.business_id where t1.city ilike 'Pittsburgh' group by t1.business_id order by checkins desc limit 10;",
        "select t1.business_id, t1.name, t1.city, t1.address, t1.review_count, t2.closing_hour, t3.category, t5.stars from business_info t1 JOIN business_hour t2 on t1.business_id=t2.business_id JOIN business_category t3 on t2.business_id=t3.business_id JOIN review_info t5 on t3.business_id=t5.business_id JOIN business_attributes_exploded t4 on t5.business_id=t4.business_id where t1.city='Las Vegas' AND t2.closing_hour > '22:00:00' AND t3.category='Restaurants' AND t5.stars > 3 AND t4.\"GoodForKids\" ilike 'True';",
        "select t1.business_id, t1.name, t1.city, t1.state, t1.address, t1.stars, t2.category from business_info t1 JOIN business_category t2 on t1.business_id=t2.business_id where t1.stars>4 AND t1.state='NY' AND t2.category='Sandwiches' limit 50;",
        "select t1.business_id, t1.name, t1.city, t1.state, t1.address, t1.stars, t1.review_count, t3.category from business_info t1 JOIN review_info t2 on t1.business_id=t2.business_id JOIN business_category t3 on t2.business_id=t3.business_id where t2.date > '2005-01-01 00:00:00' AND t2.date < '2006-01-01 00:00:00' AND t3.category ILIKE '%resort%' GROUP BY t1.business_id, t3.category ORDER BY t1.review_count DESC limit 50;",
        "select t1.business_id, t1.name, t1.city, t1.state, t1.stars, t1.review_count from business_info t1 where is_open=0 and stars <2 limit 10;",
        "select t1.city, t1.state, count(t2.review_id) as total_reviews from business_info t1 JOIN review_info t2 on t1.business_id=t2.business_id group by t1.city,t1.state order by total_reviews desc limit 10; ",
    )

    conn = psycopg2.connect(host='172.17.0.6', user='postgres', dbname='canary_db')
    try:
        cur = conn.cursor()
        print("CANARY ::> Connection Establish -> Time = {0}.s".format(time.time()-initTime))

        for state in range(100):
            print("CANARY ::> Iteration {0} starting -> Time = {1}.s".format(state, time.time()-initTime))
            for number, query in enumerate(queries, start=1):
                # No parameters are passed, so the '%' in the ILIKE pattern is
                # sent to the server as written.
                cur.execute(query)
                print("CANARY ::> Query {0} completed -> Time = {1}.s".format(number, time.time()-initTime))
            print("CANARY ::> State {0} collection -> Time = {1}.s".format(state, time.time()-initTime))

        cur.close()
        conn.commit()
    finally:
        # Release the server connection even when a query fails; the original
        # never closed it.
        conn.close()
    print("CANARY ::> Job completed -> Time = {0}.s".format(time.time()-initTime))

    return {'CANARY STATUS': "Queries completed successfully."}
44+

0 commit comments

Comments
 (0)