protein-consensus-sequence/consensus.py at master · LLei2/protein-consensus-sequence · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env python3

import argparse
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import collections
import sys
import csv
import modules

if __name__ == '__main__':

    #set path to current directory
    path = os.getcwd() + '/'

    #make directory for outputs if it does not exist
    if not os.path.exists('Outputs/'):
        os.makedirs('Outputs/')

    parser = argparse.ArgumentParser()

    parser.add_argument('-i', required=True, metavar = 'Input FASTA file.', help='Filename for FASTA alignment')
    parser.add_argument('-o', default='out.txt', metavar = 'Output gap stripped FASTA file name', help='Output FASTA filename. If not given will use name of input FASTA file as template to name output files.')
    parser.add_argument('-c', default='0', metavar = 'Method for removing insertions', help='Desired method for removing insertions. 1 = Positions with gap frequencies < threshold (0.5 default, change with -t flag). 2 =  Positions with residue as most frequent character. 3 = Positions with residues in a specific sequence. If not given will ask for user input upon running script. See README for further explantion of methods.')
    parser.add_argument('-t', type=float, default = 0.5, metavar = 'Gap frequency threshold', help='Gap frequecy threshold to define a consensus positions. Only valid for Option 1 for removing insertions. Must be a value between 0 and 1 (default: 0.5)')
    parser.add_argument('-f', action='store_true', help='Include flag to prevent saving images of MSA data analysis.')

    args = parser.parse_args()

    #Show help if no arguments passed
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(1)

    #FIGURE SETTINGS
    mpl.rcParams['axes.titlesize'] = 18
    mpl.rcParams['axes.labelsize'] = 18
    mpl.rcParams['xtick.labelsize'] = 14
    mpl.rcParams['ytick.labelsize'] = 14
    mpl.rcParams['axes.facecolor'] = 'FFFFFF'
    mpl.rcParams['axes.edgecolor'] = '000000'
    mpl.rcParams['axes.linewidth'] = 1.0
    mpl.rcParams['axes.labelweight'] = 'regular'
    mpl.rcParams['xtick.major.pad'] = 3
    mpl.rcParams['ytick.major.pad'] = 3
    plt.rcParams['font.family'] = 'sans-serif'

    print()
    print(f'Reading file: {args.i}')

    #read FASTA and populate lists for sequences and IDs
    seqs = list()
    names = list()

    #flag to strip gaps when cleaning sequence
    strip_gaps = False

    try:
        with open(args.i, 'r') as n:
            seqs, ids = modules.read_fasta(n, strip_gaps)
    except FileNotFoundError:
        print(f'Could not find file: {args.i}')
        sys.exit(1)

    #If no sequences were added to list, file was not in FASTA format
    if len(seqs) < 2:
        print('Provided file is not in FASTA format.')
        sys.exit(1)

    #Ensures sequences are aligned (all sequences have same length)
    if not modules.is_fasta_aligned(seqs):
        print('Sequences in provided FASTA are not aligned')
        sys.exit(1)

    num_seqs = len(seqs)

    #standard 20 amino acid alphabet
    res_list = list('ACDEFGHIKLMNPQRSTVWY-')

    #Matrix for residue frequencies at each position
    marginals = modules.marginal_frequencies(seqs, res_list)

    #Calculating consensus sequences for method based on user input
    consensus_positions = list()
    consensus_sequence = list()

    #Get user input for how to determine positions to include from MSA (handling gaps)
    #Users must eneter '1', '2', or '3'
    #Exits script if they do not enter valid input in five tries
    consensus_choice = args.c
    nTries = 5
    while consensus_choice not in ['1', '2', '3'] and nTries > 0:
        consensus_choice = input('\nWhich method for determining consensus positions do you want?\n***Note: Option 1 recommended.***\n1: Positions with gap frequencies < threshold (0.5 default, change with -t flag)\n2: Positions with residue as most frequent character\n3: Positions with residues in a specific sequence (you will give sequence ID)\n')
        nTries -= 1
        if nTries == 0:
            print('Invalid responses. Must enter 1, 2, or 3. Exiting program...')
            sys.exit(1)
    print()

    #Includes all positions for which gap frequency < 0.5
    #In other words, the sum of frequencies of 20 residues is > 0.5
    #This option can include positions where a gap is the most frequent occurrence
    if consensus_choice == '1':
        threshold = args.t
        if threshold < 0 or threshold > 1:
            print('Gap frequency threshold must be between 0 and 1')
            sys.exit(1)

        for i, j in enumerate(marginals):
            if j[-1] < threshold:
                max_res = np.argmax(j[:-1])
                consensus_positions.append(i)
                consensus_sequence.append(res_list[max_res])

    #Includes all positions for which a residue is the most frequent ocurrence
    #In other words, eliminates positions for which the most frequent occurrence is a gap
    elif consensus_choice == '2':
        for i, j in enumerate(marginals):
            max_res = np.argmax(j)
            if max_res != len(j) - 1:
                consensus_positions.append(i)
                consensus_sequence.append(res_list[max_res])

    #Includes all positions occupied by residues in a user-defined reference sequence
    else:
        #Take user input for sequence ID of reference sequence
        ref_seq = input('What is the ID of your reference sequence? ')

        if len(ref_seq) == 0:
            print('\nNo sequence ID was given...')
            sys.exit(1)

        #Users may not give sequence ID with leading '>' character
        #IDs from FASTA all have leading '>' character, so must add to reference ID
        if ref_seq[0] != '>':
            ref_seq = ''.join(['>', ref_seq])

        try:
            ref_index = ids.index(ref_seq)
        except ValueError:
            print('\nCould not find sequence ID in set. Check your MSA for the correct ID...')
            sys.exit(1)

        for i, (j, k) in enumerate(zip(seqs[ref_index], marginals)):
            if j != '-':
                consensus_positions.append(i)
                max_res = np.argmax(k[:-1])
                consensus_sequence.append(res_list[max_res])

    consensus_sequence = ''.join(consensus_sequence)
    print(f'Consensus sequence: {consensus_sequence}')
    print()

    #Removes positions with high gap frequencies from all sequences in alignment
    #Calculates residue frequencies for gap stripped alignment
    seqs_gap_stripped = list()
    for i in seqs:
        seqs_gap_stripped.append(''.join([i[j] for j in consensus_positions]))
    marginals_gap_stripped = modules.marginal_frequencies(seqs_gap_stripped, res_list)

    #Calculates sequence entropies for all poisitions in the gap stripped alignment
    seq_entropies = modules.seq_entropy(marginals_gap_stripped)

    #Creates matrix of residue frequencies at all positions in gap stripped alignment
    #in a format for exporting as a CSV
    out_marginals = []
    for i, j in enumerate(marginals_gap_stripped):
        if i == 0:
            out_marginals.append([''] + res_list)
            out_marginals.append([f'Position {i+1}'] + list(j))
        else:
            out_marginals.append([f'Position {i+1}'] + list(j))

    #Calculating gap frequencies
    #Gap is last element in each row of marginal frequencies
    gap_frequencies = [i[-1] for i in marginals_gap_stripped]

    #Setting path for file outputs
    #If the user does not supply a name for the output, use input file name
    #If user used a file from the Inputs directory, must remove 'Inputs/' from path
    out_file_name = args.o
    if out_file_name == 'out.txt':
        out_file_prefix = os.path.splitext(args.i)[0]
        if 'Inputs' in out_file_prefix:
            out_file_prefix = out_file_prefix.split('/')[1]
    else:
        out_file_prefix = out_file_name.split('.')[0]

    #Save figures if -f flag is not given
    if not args.f:
        #Save figure of alignment sequence entropies
        fig, ax = plt.subplots()
        ax.stem(np.arange(1, len(seq_entropies) + 1), seq_entropies, use_line_collection = True)
        ax.set_xlabel('Sequence position')
        ax.set_ylabel('Sequence entropy (bits)')
        ax.set_ylim(0, 4.5)
        fig.savefig(f'{path}Outputs/{out_file_prefix}_sequenceEntropies.png', bbox_inches = 'tight', dpi = 300)
        plt.close()

        #Save figure of gap stripped alignment gap frequencies
        fig, ax = plt.subplots()
        ax.stem(np.arange(1, len(gap_frequencies) + 1), gap_frequencies, use_line_collection = True)
        ax.set_xlabel('Sequence position')
        ax.set_ylabel('Gap frequency')
        ax.set_ylim(0, 1)
        fig.savefig(f'{path}Outputs/{out_file_prefix}_gapFrequencies.png', bbox_inches = 'tight', dpi = 300)
        plt.close()

    #Create a FASTA of the gap stripped alignment
    out_fasta = modules.make_fasta(ids, seqs_gap_stripped)

    #Save files
    with open(f'{path}Outputs/{out_file_prefix}_residueFrequencies.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(out_marginals)

    #Write summary of analysis to file
    with open(f'{path}Outputs/{out_file_prefix}_consensus_output.txt', 'w', newline='') as f:
        f.write(f'Determined consensus sequence for: {args.i}\n\n')
        f.write('Parameters used:\n')
        if consensus_choice ==  1:
            f.write(f'Method for removing insertions: {consensus_choice}\n')
            f.write(f'Gap frequency threshold: {threshold}\n\n')
        else:
            f.write(f'Method for removing insertions: {consensus_choice}\n\n')
        f.write(f'>Consensus_sequence\n{consensus_sequence}\n\n')
        f.write(f'MSA sequence entropy per residue: {np.mean(seq_entropies):.3f}\n\n')
        f.write(f'Wrote gap stripped alignment to: {out_file_prefix}_gapStrip.txt\n\n')
        f.write(f'Wrote CSV of residue frequencies to: {out_file_prefix}_residueFrequencies.csv\n\n')
        if args.f:
            f.write(f'Wrote plot of sequence entropies to: {out_file_prefix}_sequenceEntropies.png\n\n')
            f.write(f'Wrote plot of gap frequencies to: {out_file_prefix}_gapFrequencies.png\n\n')

    with open(f'{path}Outputs/{out_file_prefix}_gapStrip.txt', 'w') as f:
        f.writelines(out_fasta)

    print(f'Wrote summary of output to: Outputs/{out_file_prefix}_consensus_output.txt')