-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathCompareClusters_OtherGenomes.py
More file actions
77 lines (59 loc) · 3.51 KB
/
CompareClusters_OtherGenomes.py
File metadata and controls
77 lines (59 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#Created on 3/10/2015
__author__ = 'Juan A. Ugalde'
if __name__ == '__main__':
from collections import defaultdict
import argparse
from tools.data_input.GenomeData import read_genome_list
from tools.data_input.ClusterInput import parse_fastortho, parse_tribemcl
#Create the options and program description
program_description = "This script evaluates if from a list of clusters, how many genomes from another list are" \
"present in those clusters. The idea is to use this to look for cluster that are either" \
"under or over represented in a group of genomes." \
" The inputs are:" \
"-l genome list. The list of genomes to evaluate with the clusters."\
"-c cluster file. Original cluster file generated by either TribeMCL or OrthoMCL"\
"-t clustering algorithm. Either tribemcl or orthomcl"\
"-q clusters to query the composition" \
parser = argparse.ArgumentParser(description=program_description)
parser.add_argument("-l", "--genome_list_index", type=str,
help="File with the genomes to search in the cluster. Format GenomeID, FullName, ShortName",
required=True)
parser.add_argument("-c", "--cluster_file", type=str, help="Ortholog file", required=True)
parser.add_argument("-t", "--clustering_algorithm", type=str, help="Type of clustering used. Options are fastortho,"
"tribemcl", required=True)
parser.add_argument("-q", "--query_clusters", type=str, help="Query clusters")
parser.add_argument("-o", "--output_file", type=str, help="Output file", required=True)
args = parser.parse_args()
##Read the genome list
genome_id_dictionary, genome_count = read_genome_list(args.genome_list_index)
prefix_name_genome = {v: k for k, v in genome_id_dictionary.items()}
##Read the cluster files
#Create the output variables
cluster_information = None
set_of_proteins_in_clusters = None
unique_cluster_count = None
total_clusters = None
removed_clusters = None
if args.clustering_algorithm == "fastortho":
cluster_information, set_of_proteins_in_clusters, unique_cluster_count, total_clusters, removed_clusters = \
parse_fastortho(args.cluster_file, genome_id_dictionary.keys())
elif args.clustering_algorithm == "tribemcl":
cluster_information, set_of_proteins_in_clusters, unique_cluster_count, total_clusters, removed_clusters = \
parse_tribemcl(args.cluster_file, genome_id_dictionary.keys())
#Iterate over the query clusters
with open(args.query_clusters) as fp:
for line in fp:
if line.strip():
line = line.rstrip()
#Get the list of the proteins in the cluster
proteins_in_cluster = cluster_information[line]
#Get the genomes of the proteins
genome_cluster = defaultdict()
for protein in proteins_in_cluster:
prot_id, genome = protein.split("|")
if genome in prefix_name_genome.keys():
genome_cluster[genome] += 1
else:
print genome
print prefix_name_genome
print line + "\t" + str(len(genome_cluster.keys())) + "/" + str(len(genome_id_dictionary.keys()))