MicroCompGenomics/CompareClusters_OtherGenomes.py at master · juanu/MicroCompGenomics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#Created on 3/10/2015
__author__ = 'Juan A. Ugalde'

if __name__ == '__main__':
    from collections import defaultdict
    import argparse
    from tools.data_input.GenomeData import read_genome_list
    from tools.data_input.ClusterInput import parse_fastortho, parse_tribemcl

    #Create the options and program description
    program_description = "This script evaluates if from a list of clusters, how many genomes from another list are" \
                          "present in those clusters. The idea is to use this to look for cluster that are either" \
                          "under or over represented in a group of genomes." \
                          " The inputs are:" \
                          "-l genome list. The list of genomes to evaluate with the clusters."\
                          "-c cluster file. Original cluster file generated by either TribeMCL or OrthoMCL"\
                          "-t clustering algorithm. Either tribemcl or orthomcl"\
                          "-q clusters to query the composition" \

    parser = argparse.ArgumentParser(description=program_description)

    parser.add_argument("-l", "--genome_list_index", type=str,
                        help="File with the genomes to search in the cluster. Format GenomeID, FullName, ShortName",
                        required=True)
    parser.add_argument("-c", "--cluster_file", type=str, help="Ortholog file", required=True)
    parser.add_argument("-t", "--clustering_algorithm", type=str, help="Type of clustering used. Options are fastortho,"
                                                                       "tribemcl", required=True)
    parser.add_argument("-q", "--query_clusters", type=str, help="Query clusters")
    parser.add_argument("-o", "--output_file", type=str, help="Output file", required=True)

    args = parser.parse_args()

    ##Read the genome list
    genome_id_dictionary, genome_count = read_genome_list(args.genome_list_index)

    prefix_name_genome = {v: k for k, v in genome_id_dictionary.items()}

    ##Read the cluster files
    #Create the output variables
    cluster_information = None
    set_of_proteins_in_clusters = None
    unique_cluster_count = None
    total_clusters = None
    removed_clusters = None

    if args.clustering_algorithm == "fastortho":
        cluster_information, set_of_proteins_in_clusters, unique_cluster_count, total_clusters, removed_clusters = \
            parse_fastortho(args.cluster_file, genome_id_dictionary.keys())

    elif args.clustering_algorithm == "tribemcl":
           cluster_information, set_of_proteins_in_clusters, unique_cluster_count, total_clusters, removed_clusters = \
               parse_tribemcl(args.cluster_file, genome_id_dictionary.keys())


    #Iterate over the query clusters
    with open(args.query_clusters) as fp:
        for line in fp:
            if line.strip():
                line = line.rstrip()

                #Get the list of the proteins in the cluster
                proteins_in_cluster = cluster_information[line]

                #Get the genomes of the proteins
                genome_cluster = defaultdict()

                for protein in proteins_in_cluster:
                    prot_id, genome = protein.split("|")

                    if genome in prefix_name_genome.keys():
                        genome_cluster[genome] += 1

                    else:
                        print genome
                        print prefix_name_genome

                print line + "\t" + str(len(genome_cluster.keys())) + "/" + str(len(genome_id_dictionary.keys()))