|
def merge(names, nodes, delnodes, merged, output="final_merge.csv"):
    """Join taxonomy nodes with their scientific names and write a CSV.

    The inputs are ``.dmp`` dump files that are split on tabs and read
    without a header row, so columns are addressed by position. The
    positions used below (0/2/4 for nodes, 0/2/6 for names) presumably
    follow the NCBI taxonomy dump layout, where fields are separated by
    tab-pipe-tab and the odd columns therefore hold only ``|`` filler --
    TODO confirm against the files actually supplied.

    Args:
        names: Path to the names file. Only rows whose column 6 equals
            ``'scientific name'`` are kept.
        nodes: Path to the nodes file. Columns 0, 2 and 4 become
            ``taxa_ids``, ``parent_id`` and ``rank``.
        delnodes: Path to the deleted-nodes file. It is parsed but its
            contents are not used.
        merged: Path to the merged-ids file. Column 0 is compared with the
            node ids and the common values are printed.
        output: Where the CSV is written. Defaults to ``"final_merge.csv"``.

    Returns:
        A tuple ``(output, row_count)``: the path of the CSV that was
        written and the number of rows in the merged table.
    """
    # Function-scope imports keep this block self-contained.
    import csv
    import pandas as pd

    # QUOTE_NONE: names may contain literal double quotes; with the default
    # quoting a field that starts with '"' would swallow the following tabs
    # and newlines until another quote (or end of file) is found.
    read_opts = {'delimiter': '\t', 'header': None, 'quoting': csv.QUOTE_NONE}

    df_names = pd.read_csv(names, **read_opts)
    df_nodes = pd.read_csv(nodes, **read_opts)

    # Keep only the useful node columns and give them their final names.
    df_nodes = df_nodes[[0, 2, 4]].rename(
        columns={0: 'taxa_ids', 2: 'parent_id', 4: 'rank'}
    )

    # Keep one kind of name per taxon: the rows tagged 'scientific name'.
    # 'scientfic_name' (sic) is the header written to the CSV, so the
    # spelling is left untouched for existing consumers of that file.
    is_scientific = df_names[6] == 'scientific name'
    df_names = (
        df_names.loc[is_scientific, [0, 2]]
        .rename(columns={0: 'taxa_ids', 2: 'scientfic_name'})
        .reset_index(drop=True)
    )

    # Left join: every node is kept, with a missing name where none matched.
    # Renaming before the merge avoids depending on pandas' '_x'/'_y'
    # suffixes for the two clashing column labels.
    df_merge = df_nodes.merge(df_names, how="left", on='taxa_ids')

    # Not used further; still parsed so that a missing or unreadable file is
    # reported exactly as before.
    pd.read_csv(delnodes, **read_opts)
    df_merged = pd.read_csv(merged, **read_opts)

    # Node ids that also appear in column 0 of the merged-ids file.
    common_ids = list(set(df_merge['taxa_ids']) & set(df_merged[0]))
    print("modified values are", common_ids)

    # to_csv() returns None when given a path, so the path itself is what
    # gets handed back to the caller.
    df_merge.to_csv(output)
    print(f"csv file created as {output}")

    return output, len(df_merge)
| 53 | + |
| 54 | + |
| 55 | + |
| 56 | + |
# Run the merge on the .dmp files under taxadmp/ only when this file is
# executed as a script, so that importing it does not trigger file I/O.
if __name__ == "__main__":
    merge(
        names='taxadmp/names.dmp',
        nodes='taxadmp/nodes.dmp',
        merged='taxadmp/merged.dmp',
        delnodes='taxadmp/delnodes.dmp',
    )
0 commit comments