gen_massive_news.py — 60 lines (49 loc) · 1.48 KB
import spacy
import os
from pathlib import Path
import re
from multiprocessing import Process
import atexit
import time
class ParseWorker(Process):
    """Worker process that cleans a batch of CNN story files and writes one
    ASCII sentence per line to ``data/giant_news<numeral>.txt``.

    Each story is trimmed to the text between the "(CNN) --" byline and the
    first "@highlight" marker, sentence-split with spaCy, and filtered to
    sentences longer than 20 characters.
    """

    def __init__(self, file_list, numeral):
        """
        file_list: filenames (relative to ./data/stories/) this worker handles.
        numeral:   string suffix identifying this worker and its output file.
        """
        super().__init__()
        self.file_list = file_list
        self.numeral = numeral
        self.count = 0  # number of files processed so far (progress reporting)
        # Loaded lazily in run(): loading the spaCy model here would create a
        # large (and under spawn, unpicklable) object in the parent process.
        self.nlp = None

    def run(self):
        self.nlp = spacy.load('en')
        with open("data/giant_news" + self.numeral + ".txt", "w") as blob:
            for filename in self.file_list:
                self.count += 1
                contents = Path("./data/stories/" + filename).read_text()
                start = contents.find("(CNN) --")
                if start == -1:
                    start = 0
                else:
                    start += 8  # skip past the "(CNN) --" marker itself
                end = contents.find("@highlight")
                if end == -1:
                    # BUG FIX: the original sliced with end == -1, which is
                    # [0:-1] and silently dropped the last character.
                    end = len(contents)
                # BUG FIX: the original computed `start` but sliced from 0,
                # leaving " -- " debris the .replace() below never removed.
                contents = contents[start:end].strip().replace("(CNN)", "")
                doc = self.nlp(contents)
                for sentence in doc.sents:  # no need to materialize a list
                    if len(sentence.text) > 20:
                        # Strip newlines, then drop any non-ASCII characters.
                        safe = re.sub(r'[^\x00-\x7f]', r'', sentence.text.replace("\n", ""))
                        blob.write(safe + "\n")
                print("Thread" + self.numeral + " filenum" + str(self.count))
def chunks(l, n):
    """Split the sequence *l* into successive chunks of length *n*.

    Yields slices of *l*; the final chunk may be shorter than *n*.
    """
    start = 0
    total = len(l)
    while start < total:
        yield l[start:start + n]
        start += n
# Driver: fan the story files out across worker processes, one output file each.
if __name__ == "__main__":  # required for multiprocessing under the spawn start method
    workerList = []
    # 15,000 stories per output txt file (the old comment said 10,000 —
    # the chunk size below is what actually governs it).
    delegate = list(chunks(os.listdir("./data/stories/"), 15000))
    for i, batch in enumerate(delegate):
        worker = ParseWorker(batch, str(i))
        worker.start()
        workerList.append(worker)
    print("registered")
    # BUG FIX: the original daemonized the workers, registered an atexit join,
    # and then busy-waited in `while True: time.sleep(1)` — so the script never
    # terminated even after every worker finished. Join the workers directly.
    for worker in workerList:
        worker.join()