-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcounting_gc_content
57 lines (34 loc) · 1.77 KB
/
counting_gc_content
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# This program reads a FASTA file containing several gene sequences and reports the GC content of each one.
def counting_nucleotides(sequence):
    """Count the occurrences of each DNA nucleotide in a sequence.

    The count is case-insensitive. Any character other than C, G, A and T
    (line breaks, gaps, ambiguity codes such as N) is simply not counted.

    Args:
        sequence: String holding the nucleotide sequence.

    Returns:
        A dict with the keys 'C', 'G', 'A' and 'T' (inserted in that order),
        each mapped to the number of times that letter appears in
        ``sequence``.
    """
    # Normalise once so lower-case input is counted as well.
    sequence = sequence.upper()
    return {nucleotide: sequence.count(nucleotide) for nucleotide in 'CGAT'}
def comp_gc_content(sequence_counting):
    """Compute the GC fraction from a nucleotide count dictionary.

    Args:
        sequence_counting: Dict with the keys 'C', 'G', 'A' and 'T', each
            mapped to the count of that nucleotide.

    Returns:
        (C + G) / (C + G + A + T) as a float between 0 and 1, or 0.0 when
        all four counts are zero.
    """
    gc_count = sequence_counting['C'] + sequence_counting['G']
    total = gc_count + sequence_counting['T'] + sequence_counting['A']
    if total == 0:
        # No countable nucleotides: the ratio is undefined, so report 0.0
        # instead of raising ZeroDivisionError.
        return 0.0
    return gc_count / total
# Ask the user for the FASTA file, then report the GC content of every record.
dir_pathway = input('Type the file pathway. If it is already on this program file, just type <filename.txt>: ')

# The context manager closes the file even if reading fails.
with open(dir_pathway, 'r') as txt:
    txt_file = txt.read()

# Every FASTA record starts with '>'; whatever precedes the first '>' is dropped.
txt_strip = txt_file.split('>')[1:]

# Split each record at its first line break into [FASTA id, sequence].
# partition() always yields a sequence part (possibly empty), so a record
# made of a header line only no longer raises IndexError further down.
txt_list = []
for record in txt_strip:
    fasta_id, _, fasta_seq = record.partition('\n')
    txt_list.append([fasta_id, fasta_seq])
print(txt_list)

# Flat list: each FASTA id immediately followed by its GC content in percent.
id_gc_content = []
for fasta_id, fasta_seq in txt_list:
    dic_nuc = counting_nucleotides(fasta_seq)
    print(dic_nuc)
    if any(dic_nuc.values()):
        gc_percent = round(comp_gc_content(dic_nuc) * 100, 4)
    else:
        # A record without any C/G/A/T has no defined GC ratio; report 0.0
        # rather than dividing by zero.
        gc_percent = 0.0
    id_gc_content.append(fasta_id)
    id_gc_content.append(gc_percent)
print(id_gc_content)