# this is also called 'edit distance'.
# consider sequence X = (x1,x2,..,xm) and Y=(y1,y2, ..., yn);
# an alignment is a subset A belongs to {1,...,m} * {1, .., n}
# for any (i,j) and (i',j'): i != i', j!= j', i < i' -> j<j'
# cost function: the number of unmatched pairs + sum(matching values (if two chars are equal, is should be zeros else 1)).
# c(i,j) minimum cost of align x1,...,xi and y1,...,yj, we need to find c(m,n)
# case 1: match the last , c(i,j) = c(i-1, j-1) + matching_values(i, j)
# case 2: leaving xi unmmached, c(i,j) = c(i-1, j) + 1
# case 3: leavinf yj unmanched, c(i,j) = c(i, j-1) + 1
# c(i,j) = min(case1. case2. case3)
# base case
# c(0,j) = j
# c(i,0) = i
# when compute c(m,n) back track from (m,n) to (0,0) and record the path
def alignment_value(char1, char2):
if char1 == char2:
return 0
else:
return 1.5 # better choose a number which can be distinct to 1
def sequence_align(s1, s2):
"""
s1, s2 are strings
"""
m = len(s1)
n = len(s2)
cost = [[0 for _ in range(n+1)] for _ in range(m+1)]
# intialize values
for i in range(n+1):
cost[0][i] = i
for i in range(m+1):
cost[i][0] = i
# compute cost
for r in range(1, m+1):
for c in range(1, n+1):
cost[r][c] = min([cost[r-1][c-1] + alignment_value(s1[r-1], s2[c-1]), cost[r-1][c] + 1, cost[r][c-1] + 1])
# output s1,s2
output_s1 = []
output_s2 = []
cost_pos = (m,n)
while cost_pos != (0,0):
r,c = cost_pos
min_val = min([cost[r][c-1], cost[r-1][c], cost[r-1][c-1]])
if min_val == cost[r-1][c-1]:
output_s1.append(s1[r-1])
output_s2.append(s2[c-1])
cost_pos = (r-1,c-1)
elif min_val == cost[r-1][c]:
output_s1.append(s1[r-1])
output_s2.append("-")
cost_pos = (r-1, c)
elif min_val == cost[r][c-1]:
output_s1.append("-")
output_s2.append(s2[c-1])
cost_pos = (r, c-1)
output_s1.reverse()
output_s2.reverse()
return ("".join(output_s1), "".join(output_s2))
if __name__ == "__main__":
s1 = "snowy"
s2 = "sunny"
new_s1, new_s2 = sequence_align(s1,s2)
print(new_s1 + "\n" + new_s2)
# s-nowy
# sun-ny