基因序列相似度分析
看到"突变距离",有没有想到一个经典的 DP 问题?没错,就是编辑距离(Levenshtein Distance)——替换、插入、删除,刚好对应编辑距离的三种操作。
思路
这题的逻辑分三层:
- 精确匹配优先:先扫一遍数据库,如果待测片段和某个序列完全相同,直接输出,结束。
- 模糊匹配:没有精确匹配的话,对每个数据库序列算一次编辑距离,距离
的收集起来,按距离升序排列,距离相同按字典序排列。
- 无匹配:一个都没找到,输出
None。
核心就在编辑距离的计算上。
编辑距离怎么算?
设两个字符串 (长度
)和
(长度
),定义
为
的前
个字符变成
的前
个字符所需的最少操作数。
转移方程:
- 如果
:
,不需要操作
- 否则:
,分别对应替换、删除、插入
初始条件:,
。
空间优化:二维数组可以压成一维,只保留上一行的信息,用一个变量 prev 记住左上角的值就行。
复杂度
- 时间:
,
是数据库序列数,
、
是序列长度
- 空间:
,压缩后的一维 DP 数组
代码
#include <bits/stdc++.h>
using namespace std;
int editDist(const string& a, const string& b) {
int m = a.size(), n = b.size();
vector<int> dp(n + 1);
for (int j = 0; j <= n; j++) dp[j] = j;
for (int i = 1; i <= m; i++) {
int prev = dp[0];
dp[0] = i;
for (int j = 1; j <= n; j++) {
int tmp = dp[j];
if (a[i-1] == b[j-1]) dp[j] = prev;
else dp[j] = 1 + min({prev, dp[j], dp[j-1]});
prev = tmp;
}
}
return dp[n];
}
int main(){
int K, N;
cin >> K >> N;
vector<string> seqs(N);
for(int i = 0; i < N; i++) cin >> seqs[i];
string query;
cin >> query;
for(auto& s : seqs){
if(s == query){
cout << s << endl;
return 0;
}
}
vector<pair<int,string>> results;
for(auto& s : seqs){
int d = editDist(s, query);
if(d <= K) results.push_back({d, s});
}
if(results.empty()){
cout << "None" << endl;
} else {
sort(results.begin(), results.end());
for(int i = 0; i < (int)results.size(); i++){
if(i) cout << " ";
cout << results[i].second;
}
cout << endl;
}
return 0;
}
import java.util.*;
public class Main {
static int editDist(String a, String b) {
int m = a.length(), n = b.length();
int[] dp = new int[n + 1];
for (int j = 0; j <= n; j++) dp[j] = j;
for (int i = 1; i <= m; i++) {
int prev = dp[0];
dp[0] = i;
for (int j = 1; j <= n; j++) {
int tmp = dp[j];
if (a.charAt(i-1) == b.charAt(j-1)) dp[j] = prev;
else dp[j] = 1 + Math.min(prev, Math.min(dp[j], dp[j-1]));
prev = tmp;
}
}
return dp[n];
}
public static void main(String[] args) {
Scanner sc = new Scanner(System.in);
int K = sc.nextInt();
int N = sc.nextInt();
String[] seqs = new String[N];
for (int i = 0; i < N; i++) seqs[i] = sc.next();
String query = sc.next();
for (String s : seqs) {
if (s.equals(query)) {
System.out.println(s);
return;
}
}
List<int[]> idxResults = new ArrayList<>();
for (int i = 0; i < N; i++) {
int d = editDist(seqs[i], query);
if (d <= K) idxResults.add(new int[]{d, i});
}
if (idxResults.isEmpty()) {
System.out.println("None");
} else {
idxResults.sort((a, b) -> {
if (a[0] != b[0]) return a[0] - b[0];
return seqs[a[1]].compareTo(seqs[b[1]]);
});
StringBuilder sb = new StringBuilder();
for (int i = 0; i < idxResults.size(); i++) {
if (i > 0) sb.append(" ");
sb.append(seqs[idxResults.get(i)[1]]);
}
System.out.println(sb.toString());
}
}
}
import sys
input = sys.stdin.readline
def edit_dist(a, b):
m, n = len(a), len(b)
dp = list(range(n + 1))
for i in range(1, m + 1):
prev = dp[0]
dp[0] = i
for j in range(1, n + 1):
tmp = dp[j]
if a[i-1] == b[j-1]:
dp[j] = prev
else:
dp[j] = 1 + min(prev, dp[j], dp[j-1])
prev = tmp
return dp[n]
K = int(input())
N = int(input())
seqs = [input().strip() for _ in range(N)]
query = input().strip()
for s in seqs:
if s == query:
print(s)
exit()
results = []
for s in seqs:
d = edit_dist(s, query)
if d <= K:
results.append((d, s))
if not results:
print("None")
else:
results.sort()
print(" ".join(r[1] for r in results))
const readline = require('readline');
const rl = readline.createInterface({ input: process.stdin });
const lines = [];
rl.on('line', line => lines.push(line.trim()));
rl.on('close', () => {
let idx = 0;
const K = parseInt(lines[idx++]);
const N = parseInt(lines[idx++]);
const seqs = [];
for (let i = 0; i < N; i++) seqs.push(lines[idx++]);
const query = lines[idx++];
for (const s of seqs) {
if (s === query) {
console.log(s);
return;
}
}
function editDist(a, b) {
const m = a.length, n = b.length;
const dp = new Array(n + 1);
for (let j = 0; j <= n; j++) dp[j] = j;
for (let i = 1; i <= m; i++) {
let prev = dp[0];
dp[0] = i;
for (let j = 1; j <= n; j++) {
const tmp = dp[j];
if (a[i-1] === b[j-1]) dp[j] = prev;
else dp[j] = 1 + Math.min(prev, dp[j], dp[j-1]);
prev = tmp;
}
}
return dp[n];
}
const results = [];
for (const s of seqs) {
const d = editDist(s, query);
if (d <= K) results.push([d, s]);
}
if (results.length === 0) {
console.log("None");
} else {
results.sort((a, b) => a[0] !== b[0] ? a[0] - b[0] : a[1].localeCompare(b[1]));
console.log(results.map(r => r[1]).join(" "));
}
});

京公网安备 11010502036488号