1
# Note to viewer: this is a small collection of functions for bioinformatics and genomics work.
# They are fun and help intertwine computer science and genetics!
# Hope you have fun!
4
+
5
+
6
def hamming_distance(seq1, seq2):
    '''
    Count the positions at which two genomic sequences differ.

    The Hamming distance is only defined for sequences of equal length, so
    inputs of different lengths are rejected instead of being silently
    truncated to the shorter one.

    Arg: Two separate sequences of the same length (str)
    Output: Number of differing positions (int)
    Raises: ValueError if the sequences have different lengths.

    Example: hamming_distance("ATCTGGT","ATGAAGT")
    --> 3
    '''
    if len(seq1) != len(seq2):
        raise ValueError(
            "hamming_distance requires sequences of equal length "
            "(got %d and %d)" % (len(seq1), len(seq2))
        )

    # Walk both sequences in lockstep and count the mismatching pairs.
    return sum(1 for base1, base2 in zip(seq1, seq2) if base1 != base2)
25
+
26
+
27
+
28
def seq_encode(seq):
    '''
    Run-length encode a sequence of genomic characters.

    Each run of identical consecutive characters is written as the run
    length followed by the character. A run of length 1 is written as the
    bare character (the number 1 is never emitted).

    Note: digits that appear in the input are copied through unescaped, so
    an input that itself contains digits cannot be decoded unambiguously.

    Arg: Sequence of interest (str)
    Output: Encoded version of sequence (str)

    Example: seq_encode('CTTTCCCAAAG$$AAT')
    --> 'C3T3C3AG2$2AT'
    '''
    # Local import: the file has no top-level import block to extend.
    from itertools import groupby

    encoded_parts = []

    # groupby yields one (character, run) pair per stretch of equal characters.
    for char, run in groupby(seq):
        run_length = sum(1 for _ in run)
        if run_length == 1:
            encoded_parts.append(char)
        else:
            encoded_parts.append(str(run_length) + char)

    return "".join(encoded_parts)
78
+
79
def seq_decode(seq):
    '''
    Decode a run-length encoded sequence of genomic characters.

    A decimal count (one or more digits) repeats the character that follows
    it that many times; any character without a preceding count is copied
    as-is. Multi-digit counts are supported so that runs of 10 or more
    characters survive an encode/decode round trip.

    Arg: Encoded sequence of interest (str)
    Output: Decoded version of sequence (str)
    Raises: ValueError if the input ends with a count that has no character
            after it.

    Example: seq_decode('C3T3C3AG2$2AT')
    --> 'CTTTCCCAAAG$$AAT'
    '''
    decoded_parts = []
    pending_count = ""  # digits of the count currently being read

    for char in seq:
        if char.isdecimal():
            # Keep collecting digits until the character they apply to.
            pending_count += char
        elif pending_count:
            decoded_parts.append(int(pending_count) * char)
            pending_count = ""
        else:
            # Letters and symbols without a count stand for themselves.
            decoded_parts.append(char)

    if pending_count:
        raise ValueError(
            "encoded sequence ends with a count (%s) that has no character "
            "to repeat" % pending_count
        )

    return "".join(decoded_parts)
112
+
113
+
114
+
115
+
116
+
117
+
0 commit comments