forked from Ada-Activities/Encode-Decode-Genetic-Sequences
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpart_one.py
More file actions
91 lines (70 loc) · 2.56 KB
/
part_one.py
File metadata and controls
91 lines (70 loc) · 2.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def driver():
    """Run the demo: categorize sample strands, encode them, list the unknowns.

    Each sample sequence is classified with ``categorize_strand`` and filed
    under "dna", "rna" or "undetermined". Every sequence is then encoded with
    ``encode_strand``, and finally the undetermined sequences are printed.
    """
    print("-------------------------")
    print("Categorizing sequences...")
    print("-------------------------")
    all_sequences = [
        "GGGGGAAAGGCCCCTTTAAAACCCCTTTTTAAAACCCCCGGGAAAATTTTAAA",
        "GGGGGAAAUUCCCCTTTAAAACCCCUUUUUAAAACCCCCGGGAAAATTTTAAA",
        "CCCAAAAATTTTCCCCGGGTTAAAATTTTTGGGGGAAACCCGGGGAAAACCCCC",
        "CCCAAAAAGGGGCCCCCGGGGAAAACCCCGGGGGAAACCCGGGGAAAACCCCC",
    ]

    # categorize_strand reports its verdict as an integer code (-1, 0 or 1),
    # while the buckets below are keyed by name; translate between the two so
    # the lookups do not raise KeyError.
    category_names = {-1: "undetermined", 0: "dna", 1: "rna"}
    categorized_sequences = {
        "undetermined": [],  # strands that can't be determined
        "dna": [],  # dna strands
        "rna": [],  # rna strands
    }
    for sequence in all_sequences:
        category = categorize_strand(sequence)
        categorized_sequences[category_names[category]].append(sequence)

    print("-------------------------")
    print("Encoding sequences for storage...")
    print("-------------------------")
    encoded_sequences = [encode_strand(sequence) for sequence in all_sequences]

    print("-------------------------")
    print("Listing undetermined sequences for review...")
    print("-------------------------")
    for sequence in categorized_sequences["undetermined"]:
        print(sequence)
def categorize_strand(strand):
    """Classify a strand by which of the bases "T" and "U" it contains.

    Returns:
        0  for DNA (the strand contains "T" but no "U").
        1  for RNA (the strand contains "U" but no "T").
        -1 when the strand cannot be categorized, i.e. it contains both
           "T" and "U", or it contains neither of them.
    """
    found_t = False
    found_u = False
    for nucleotide in strand:
        found_t = found_t or nucleotide == "T"
        found_u = found_u or nucleotide == "U"

    # Both flags equal means "both present" or "both absent": undetermined.
    if found_t == found_u:
        return -1
    if found_t:
        return 0
    return 1
def encode_strand(strand):
    """Run-length encode a strand as alternating base / count pairs.

    Each run of identical consecutive bases becomes the base followed by the
    decimal length of the run, e.g. "AAABB" -> "A3B2". A run of ten or more
    bases yields a multi-digit count.

    Args:
        strand: The sequence of bases to encode (a string).

    Returns:
        The encoded string, or "" when ``strand`` is empty.
    """
    if not strand:
        return ""

    encoding = []
    current_base = strand[0]
    count = 1
    for base in strand[1:]:
        if base == current_base:
            count += 1
        else:
            # The count must be converted to text before concatenation.
            encoding.append(current_base + str(count))
            current_base = base
            count = 1

    # The loop only emits a run when the base changes, so the final run
    # has to be flushed explicitly.
    encoding.append(current_base + str(count))
    return "".join(encoding)
def decode_strand(encoding):
    """Expand a run-length encoded strand back into its bases.

    The encoding is a series of entries, each a single base character
    followed by a decimal repeat count of one or more digits,
    e.g. "A3B2" -> "AAABB" and "A12" -> twelve "A" characters.

    Args:
        encoding: The encoded string.

    Returns:
        The decoded strand, or "" when ``encoding`` is empty.

    Raises:
        ValueError: If a base is not followed by a count.
    """
    if not encoding:
        return ""

    strand = []
    position = 0
    length = len(encoding)
    while position < length:
        letter = encoding[position]
        position += 1

        # Consume every digit after the base so counts above 9 decode
        # correctly instead of being read as separate entries.
        count_start = position
        while position < length and encoding[position].isdigit():
            position += 1
        digits = encoding[count_start:position]
        if not digits:
            raise ValueError(
                "missing repeat count after %r at index %d"
                % (letter, count_start - 1)
            )

        strand.append(letter * int(digits))
    return "".join(strand)