forked from tiantiantu/KSI
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing1.py
More file actions
92 lines (65 loc) · 2.04 KB
/
preprocessing1.py
File metadata and controls
92 lines (65 loc) · 2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import codecs
from collections import defaultdict
import csv
import string
from stop_words import get_stop_words # download stop words package from https://pypi.org/project/stop-words/
import numpy as np
# English stop-word collection from the third-party `stop_words` package;
# consulted further down when filtering the note vocabulary.
stop_words = get_stop_words('english')
# Collect the discharge-summary notes from NOTEEVENTS.csv, grouped by the value
# in column 2 (presumably the admission ID, HADM_ID -- confirm against the CSV
# header).  Each note (last column) is flattened to a single line, stripped of
# ASCII punctuation and lower-cased before being stored.
admidic = defaultdict(list)
count = 0  # number of discharge summaries kept

# One translation table, built once instead of once per note: line-break
# characters become spaces, every character in string.punctuation is deleted.
note_table = str.maketrans('\r\n', '  ', string.punctuation)

# newline='' is what the csv module asks for so that line breaks embedded in
# quoted fields (the note text) reach the parser untouched.
with open('NOTEEVENTS.csv', 'r', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in spamreader:
        # Blank or truncated rows have no column 6; skip them rather than
        # crash with an IndexError.
        if len(row) <= 6:
            continue
        # Column 6 is compared against the literal category name; rows of any
        # other category (and the header row) are ignored.
        if row[6] == 'Discharge summary':
            admidic[row[2]].append(row[-1].translate(note_table).lower())
            count += 1
# Count how often each whitespace-separated token occurs across all stored
# notes, then keep the tokens that make up the output vocabulary.
u = defaultdict(int)
for notes in admidic.values():
    for note in notes:
        # str.split() with no argument already ignores leading/trailing
        # whitespace, so no separate strip('\n') is needed.
        for token in note.split():
            u[token] += 1

# The stop-word collection is probed once per distinct token; converting it to
# a set once makes each probe O(1) without changing the outcome.
stop_word_set = set(stop_words)

# Vocabulary: tokens that are not made up solely of digits, occur more than 10
# times, and are not stop words.  Kept as a defaultdict(int) so that looking
# up an unknown token yields 0.
u2 = defaultdict(int)
for token, freq in u.items():
    if freq > 10 and not token.isdigit() and token not in stop_word_set:
        u2[token] = freq

# Release the full frequency table; nothing later in the visible script reads it.
u = []
# Map column 2 of DIAGNOSES_ICD.csv (presumably HADM_ID -- confirm against the
# header) to the list of codes found in column 4, each prefixed with "d_".
# Rows whose code field is empty are ignored.
ad2c = defaultdict(list)

# `with` closes the file even if a row fails to parse; the previous handle was
# never closed.  csv.reader removes the quotes around the code field, which the
# earlier manual split emulated by slicing off the first and last character
# (and which would have mangled an unquoted code).
with open('DIAGNOSES_ICD.csv', 'r', newline='') as file1:
    icd_reader = csv.reader(file1, delimiter=',', quotechar='"')
    next(icd_reader, None)  # discard the first line (presumably the header)
    for row in icd_reader:
        if not row:
            continue  # blank line in the file
        icd_code = row[4]
        if icd_code != '':
            ad2c[row[2]].append("d_" + icd_code)
# Tally how many times each code string appears across all of the lists
# stored in ad2c.
codeu = defaultdict(int)
for code_list in ad2c.values():
    for diagnosis in code_list:
        codeu[diagnosis] += 1
# Minimum corpus-wide frequency a code needs in order to be written out;
# 0 keeps every code.
cthre = 0

# IDs to export, in the order stored in the array.  Loaded before the output
# file is opened so that a missing/unreadable IDlist.npy does not leave behind
# a freshly truncated, empty output file.
# NOTE(review): encoding='bytes' only matters for arrays pickled by Python 2 --
# confirm how IDlist.npy was produced.
IDlist = np.load('IDlist.npy', encoding='bytes').astype(str)

# Write one record per ID that has at least one code in ad2c:
#   start! <id>
#   codes: <code> <code> ...
#   notes:
#   <vocabulary tokens of the first note>
#   ...
#   end!
# `with` guarantees the output is flushed and closed even if a record fails.
with open("combined_dataset", 'w') as fileo:
    for i in IDlist:
        # .get() rather than indexing: indexing a defaultdict inserts the
        # missing key, which would grow the tables with every unknown
        # ID or out-of-vocabulary token.
        codes = ad2c.get(i)
        if not codes:
            continue

        fileo.write('start! ' + i + '\n')

        # Truncate each sufficiently frequent code to its first five
        # characters and drop duplicates, preserving first-seen order
        # (dict keys keep insertion order).
        tempc = dict.fromkeys(
            code[0:5] for code in codes if codeu.get(code, 0) >= cthre
        )
        fileo.write('codes: ' + ''.join(code + " " for code in tempc) + '\n')

        fileo.write('notes:\n')
        for note in admidic.get(i, ()):
            # Keep only tokens present in the vocabulary; every kept token is
            # followed by a single space, then the note line is terminated.
            kept = [token for token in note.split() if u2.get(token, 0) != 0]
            fileo.write(''.join(token + " " for token in kept) + '\n')

        fileo.write('end!\n')