Skip to content

Commit 72f48de

Browse files
Data parsing function
1 parent bc7fda7 commit 72f48de

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

data_parsing_function.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
def merge(names, nodes, delnodes, merged, output_path="final_merge.csv"):
    """Join a nodes dump with its scientific names and write the table as CSV.

    All four inputs are read as tab-delimited text without a header row.
    Columns are addressed by position after splitting on tabs; the odd
    positions are presumably the ``|`` separators of taxonomy ``.dmp``
    files (verify against the actual dump format).

    Args:
        names: Path to the names dump. Column 0 is used as the taxon id,
            column 2 as the name and column 6 as the name class; only rows
            whose name class equals ``'scientific name'`` are kept.
        nodes: Path to the nodes dump. Column 0 is used as the taxon id,
            column 2 as the parent id and column 4 as the rank.
        delnodes: Path to the deleted-nodes dump. It is read but its
            content is not used in the output.
        merged: Path to the merged-nodes dump. Column 0 is compared with
            the taxon ids of the nodes dump.
        output_path: Where the CSV is written. Defaults to
            ``"final_merge.csv"``, the name that used to be hard-coded.

    Returns:
        A tuple ``(output_path, row_count)`` where ``row_count`` is the
        number of rows written (one per row of the nodes dump).

    Side effects:
        Writes the CSV (including the DataFrame index column) to
        ``output_path`` and prints two status lines to stdout.
    """
    # Imports stay function-local, as in the original, so the function does
    # not depend on a file-level import block.
    import csv

    import pandas as pd

    def read_dump(path, usecols=None):
        # The dumps are plain tab-separated text, not quoted CSV. QUOTE_NONE
        # keeps double quotes that occur inside names from being interpreted
        # as field quoting (which would strip them or swallow whole rows).
        return pd.read_csv(
            path,
            delimiter="\t",
            header=None,
            usecols=usecols,
            quoting=csv.QUOTE_NONE,
        )

    # Load only the columns that are actually used; with header=None the
    # selected columns keep their positional labels and file order.
    nodes_df = read_dump(nodes, usecols=[0, 2, 4])
    nodes_df.columns = ["taxa_ids", "parent_id", "rank"]

    names_df = read_dump(names, usecols=[0, 2, 6])
    # NOTE: 'scientfic_name' is misspelled, but it is the header written to
    # the CSV, so it is kept unchanged for existing consumers of the file.
    names_df.columns = ["taxa_ids", "scientfic_name", "name_class"]

    # Keep one name per taxon: the row flagged as the scientific name.
    is_scientific = names_df["name_class"] == "scientific name"
    scientific = names_df.loc[is_scientific, ["taxa_ids", "scientfic_name"]]

    # Left join so every node row survives, even without a scientific name.
    table = nodes_df.merge(scientific, how="left", on="taxa_ids")

    # delnodes is not used further; the read is kept so that a missing or
    # unreadable file still raises exactly as it did before.
    read_dump(delnodes)

    # Report ids that occur both in the nodes dump and in column 0 of the
    # merged dump. Sorting makes the printed order deterministic.
    merged_ids = set(read_dump(merged, usecols=[0])[0])
    current_ids = set(table["taxa_ids"])
    print("modified values are", sorted(current_ids & merged_ids))

    # DataFrame.to_csv returns None when given a path, so return the path
    # itself instead of the call's result.
    table.to_csv(output_path)
    print(f"csv file created as {output_path}")

    return output_path, len(table)
53+
54+
55+
56+
57+
# Run the merge on the four .dmp files under taxadmp/ when this file executes.
merge(
    names='taxadmp/names.dmp',
    nodes='taxadmp/nodes.dmp',
    delnodes='taxadmp/delnodes.dmp',
    merged='taxadmp/merged.dmp',
)

0 commit comments

Comments
 (0)