Skip to content

Commit 53a2118

Browse files
authored
Added new project "DNA Sequence Analysis"
Added new project
1 parent 2ce4465 commit 53a2118

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed

DNA_sequence_analysis/README.md

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Note to viewer: this is a small collection of functions for bioinformatics and genomics work.
2+
#they are fun and help intertwine computer science and genetics!
3+
#hope you have fun!
4+
5+
6+
def hamming_distance(seq1, seq2):
    '''
    Count the positions at which two genomic sequences differ.

    The sequences are walked in lockstep and compared position by position.
    If one sequence is longer than the other, the comparison stops at the
    end of the shorter one; the leftover characters are not counted.

    Arg: Two separate sequences (str)
    Output: Number of differing positions (int)

    Example: hamming_distance("ATCTGGT","ATGAAGT")
    --> 3
    '''
    # Add one for every aligned pair of characters that does not match.
    return sum(1 for base1, base2 in zip(seq1, seq2) if base1 != base2)
25+
26+
27+
28+
def seq_encode(seq):
    '''
    Run-length encode a sequence of genomic characters.

    Every run of identical consecutive characters is written as the run
    length followed by the character. A run of length 1 is written as the
    bare character, without a leading "1". An empty sequence encodes to an
    empty string.

    Arg: Sequence of interest (str)
    Output: Encoded version of sequence (str)

    Example: seq_encode('CTTTCCCAAAG$$AAT')
    --> 'C3T3C3AG2$2AT'

    Note: the sequence should not itself contain digit characters; in the
    encoded output they could not be told apart from run lengths.
    '''
    # Imported locally so the function stays self-contained.
    from itertools import groupby

    encoded_runs = []
    # groupby yields one (character, run-iterator) pair per run of equal
    # consecutive characters, so no first/last-index special cases are needed.
    for char, run in groupby(seq):
        run_length = sum(1 for _ in run)
        if run_length == 1:
            # Single characters are emitted without a count prefix.
            encoded_runs.append(char)
        else:
            encoded_runs.append(str(run_length) + char)

    return "".join(encoded_runs)
78+
79+
def seq_decode(seq):
    '''
    Decode a run-length encoded sequence of genomic characters.

    The input is read left to right. A run of one or more digits is taken
    as a repeat count for the character that follows it; a character with
    no count in front of it is copied once. Counts may have several digits
    (e.g. '12A'), so runs of ten or more characters decode correctly.

    Arg: Encoded sequence of interest (str)
    Output: Decoded version of sequence (str)
    Raises: IndexError if the input ends with a count that has no
            character after it.

    Example: seq_decode('C3T3C3AG2$2AT')
    --> 'CTTTCCCAAAG$$AAT'
    '''
    decoded_runs = []   # decoded pieces, joined once at the end
    count_digits = ""   # digits of the repeat count currently being read

    for char in seq:
        if char.isdecimal():
            # Keep collecting digits until the character they apply to.
            count_digits += char
            continue

        # Letters and symbols alike: repeat by the pending count, or once
        # when no count came before the character.
        repeat = int(count_digits) if count_digits else 1
        decoded_runs.append(repeat * char)
        count_digits = ""

    if count_digits:
        # A dangling count has nothing to repeat; the exception type is the
        # one the previous single-digit lookup (seq[index + 1]) raised.
        raise IndexError(
            "encoded sequence ends with a count but no character to repeat"
        )

    return "".join(decoded_runs)
112+
113+
114+
115+
116+
117+

0 commit comments

Comments
 (0)