Skip to content

Commit 8fda884

Browse files
author
xuwenyihust
committed
Some updates.
1 parent 03aae5a commit 8fda884

File tree

3 files changed

+34
-16
lines changed

3 files changed

+34
-16
lines changed

conf/parameters.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,6 @@
44
"DStream": {
55
"batch_interval": "60",
66
"window_time": "60",
7-
"process_times": "30"
7+
"process_times": "60"
88
}
99
}

src/analysis.py

Lines changed: 32 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ def sentiment_analysis(lines, model, hashingTF, iDF):
8585
analysis.foreachRDD(lambda x: pos_cnt_li.extend(x.collect()))
8686

8787

88-
def data_to_db(db, start_time, counts, keywords, hashtags, pos):
88+
def data_to_db(db, start_time, counts, keywords, hashtags, pos, tracking_word, related_keywords_tb):
8989
# Store counts
9090
counts_t = []
9191
for i in range(min(len(counts), len(start_time))):
@@ -100,7 +100,8 @@ def data_to_db(db, start_time, counts, keywords, hashtags, pos):
100100
db['counts'].insert(counts_js)
101101
# Store keywords
102102
collection = db['keywords']
103-
db['keywords'].insert(keywords)
103+
if related_keywords_tb:
104+
db['keywords'].insert(keywords)
104105
# Store hashtags
105106
collection = db['hashtags']
106107
db['hashtags'].insert(hashtags)
@@ -115,6 +116,11 @@ def data_to_db(db, start_time, counts, keywords, hashtags, pos):
115116
ratio_js = json.loads(ratio_df.reset_index().to_json(orient='records'))
116117
collection = db['ratio']
117118
db['ratio'].insert(ratio_js)
119+
# Store tracking_word
120+
tracking_word_df = pd.DataFrame([tracking_word], columns=['Tracking_word'])
121+
tracking_word_js = json.loads(tracking_word_df.reset_index().to_json(orient='records'))
122+
collection = db['tracking_word']
123+
db['tracking_word'].insert(tracking_word_js)
118124

119125
def parseLine(line):
120126
parts = line.split('\t')
@@ -126,7 +132,7 @@ def parseLine(line):
126132
return (label, words)
127133

128134

129-
def main(sc, db):
135+
def main(sc, db, tracking_word):
130136

131137
print('>'*30+'SPARK START'+'>'*30)
132138

@@ -239,6 +245,9 @@ def main(sc, db):
239245
if len(sqlContext.tables().filter("tableName LIKE 'related_keywords_tmp'").collect()) == 1:
240246
top_words = sqlContext.sql( 'Select Keyword, Count from related_keywords_tmp' )
241247
related_keywords_df = related_keywords_df.unionAll(top_words)
248+
related_keywords_tb = True
249+
else:
250+
related_keywords_tb = False
242251

243252
# Find the top related hashtags
244253
if len(sqlContext.tables().filter("tableName LIKE 'related_hashtags_tmp'").collect()) == 1:
@@ -249,34 +258,41 @@ def main(sc, db):
249258
process_cnt += 1
250259

251260
# Final tables
252-
related_keywords_df = related_keywords_df.filter(related_keywords_df['Keyword'] != 'none')
253-
# Spark SQL to Pandas Dataframe
254-
related_keywords_pd = related_keywords_df.toPandas()
255-
related_keywords_pd = related_keywords_pd.groupby(related_keywords_pd['Keyword']).sum()
256-
related_keywords_pd = pd.DataFrame(related_keywords_pd)
257-
related_keywords_pd = related_keywords_pd.sort("Count", ascending=0).iloc[0:9]
261+
if related_keywords_tb:
262+
related_keywords_df = related_keywords_df.filter(related_keywords_df['Keyword'] != 'none')
263+
# Spark SQL to Pandas Dataframe
264+
related_keywords_pd = related_keywords_df.toPandas()
265+
related_keywords_pd = related_keywords_pd[related_keywords_pd['Keyword'] != tracking_word]
266+
related_keywords_pd = related_keywords_pd.groupby(related_keywords_pd['Keyword']).sum()
267+
related_keywords_pd = pd.DataFrame(related_keywords_pd)
268+
related_keywords_pd = related_keywords_pd.sort("Count", ascending=0).iloc[0:min(9, related_keywords_pd.shape[0])]
258269

259270
# Spark SQL to Pandas Dataframe
260-
related_hashtags_pd = related_hashtags_df.toPandas()
271+
related_hashtags_pd = related_hashtags_df.toPandas()
272+
related_hashtags_pd = related_hashtags_pd[related_hashtags_pd['Hashtag'] != '#'+tracking_word]
261273
related_hashtags_pd = related_hashtags_pd.groupby(related_hashtags_pd['Hashtag']).sum()
262274
related_hashtags_pd = pd.DataFrame(related_hashtags_pd)
263-
related_hashtags_pd = related_hashtags_pd.sort("Count", ascending=0).iloc[0:9]
275+
related_hashtags_pd = related_hashtags_pd.sort("Count", ascending=0).iloc[0:min(9, related_hashtags_pd.shape[0])]
264276

265277
ssc.stop()
266278
###########################################################################
267279

268280
print(tweet_cnt_li)
269281
print(start_time)
270282
print(pos_cnt_li)
283+
print(related_keywords_tb)
271284
#print(related_keywords_pd.head(10))
272285
#print(related_hashtags_pd.head(10))
273-
related_keywords_js = json.loads(related_keywords_pd.reset_index().to_json(orient='records'))
286+
if related_keywords_tb:
287+
related_keywords_js = json.loads(related_keywords_pd.reset_index().to_json(orient='records'))
288+
else:
289+
related_keywords_js = None
274290
#print(related_keywords_js)
275291
related_hashtags_js = json.loads(related_hashtags_pd.reset_index().to_json(orient='records'))
276292
#print(related_hashtags_js)
277293

278294
# Store the data to MongoDB
279-
data_to_db(db, start_time, tweet_cnt_li, related_keywords_js, related_hashtags_js, pos_cnt_li)
295+
data_to_db(db, start_time, tweet_cnt_li, related_keywords_js, related_hashtags_js, pos_cnt_li, tracking_word, related_keywords_tb)
280296

281297
print('>'*30+'SPARK STOP'+'>'*30)
282298

@@ -297,6 +313,7 @@ def main(sc, db):
297313
# Load parameters
298314
with open('conf/parameters.json') as f:
299315
p = json.load(f)
316+
tracking_word = p['hashtag'][1:]
300317
batch_interval = int(p['DStream']['batch_interval'])
301318
window_time = int(p['DStream']['window_time'])
302319
process_times = int(p['DStream']['process_times'])
@@ -308,6 +325,7 @@ def main(sc, db):
308325
db['keywords'].drop()
309326
db['hashtags'].drop()
310327
db['ratio'].drop()
328+
db['tracking_word'].drop()
311329
# Execute main function
312-
main(sc, db)
330+
main(sc, db, tracking_word)
313331

web/static/count.js

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ d3.json("/data/counts", function(error, data) {
5757
.text("Count");
5858

5959
var line = d3.svg.line()
60-
.x(function(d) { return 40+count_x(d.Time); })
60+
.x(function(d) { return 5+count_x(d.Time); })
6161
.y(function(d) { return count_y(d.Count);});
6262

6363

0 commit comments

Comments (0)