基因序列相似度分析

看到"突变距离",有没有想到一个经典的 DP 问题?没错,就是编辑距离(Levenshtein Distance)——替换、插入、删除,刚好对应编辑距离的三种操作。

思路

这题的逻辑分三层:

  1. 精确匹配优先:先扫一遍数据库,如果待测片段和某个序列完全相同,直接输出,结束。
  2. 模糊匹配:没有精确匹配的话,对每个数据库序列算一次编辑距离,距离 的收集起来,按距离升序排列,距离相同按字典序排列。
  3. 无匹配:一个都没找到,输出 None

核心就在编辑距离的计算上。

编辑距离怎么算?

设两个字符串 (长度 )和 (长度 ),定义 的前 个字符变成 的前 个字符所需的最少操作数。

转移方程:

  • 如果 ,不需要操作
  • 否则:,分别对应替换、删除、插入

初始条件:

空间优化:二维数组可以压成一维,只保留上一行的信息,用一个变量 prev 记住左上角的值就行。

复杂度

  • 时间: 是数据库序列数, 是序列长度
  • 空间:,压缩后的一维 DP 数组

代码

#include <bits/stdc++.h>
using namespace std;

int editDist(const string& a, const string& b) {
    int m = a.size(), n = b.size();
    vector<int> dp(n + 1);
    for (int j = 0; j <= n; j++) dp[j] = j;
    for (int i = 1; i <= m; i++) {
        int prev = dp[0];
        dp[0] = i;
        for (int j = 1; j <= n; j++) {
            int tmp = dp[j];
            if (a[i-1] == b[j-1]) dp[j] = prev;
            else dp[j] = 1 + min({prev, dp[j], dp[j-1]});
            prev = tmp;
        }
    }
    return dp[n];
}

int main(){
    int K, N;
    cin >> K >> N;
    vector<string> seqs(N);
    for(int i = 0; i < N; i++) cin >> seqs[i];
    string query;
    cin >> query;

    for(auto& s : seqs){
        if(s == query){
            cout << s << endl;
            return 0;
        }
    }

    vector<pair<int,string>> results;
    for(auto& s : seqs){
        int d = editDist(s, query);
        if(d <= K) results.push_back({d, s});
    }

    if(results.empty()){
        cout << "None" << endl;
    } else {
        sort(results.begin(), results.end());
        for(int i = 0; i < (int)results.size(); i++){
            if(i) cout << " ";
            cout << results[i].second;
        }
        cout << endl;
    }
    return 0;
}
import java.util.*;

public class Main {
    static int editDist(String a, String b) {
        int m = a.length(), n = b.length();
        int[] dp = new int[n + 1];
        for (int j = 0; j <= n; j++) dp[j] = j;
        for (int i = 1; i <= m; i++) {
            int prev = dp[0];
            dp[0] = i;
            for (int j = 1; j <= n; j++) {
                int tmp = dp[j];
                if (a.charAt(i-1) == b.charAt(j-1)) dp[j] = prev;
                else dp[j] = 1 + Math.min(prev, Math.min(dp[j], dp[j-1]));
                prev = tmp;
            }
        }
        return dp[n];
    }

    public static void main(String[] args) {
        Scanner sc = new Scanner(System.in);
        int K = sc.nextInt();
        int N = sc.nextInt();
        String[] seqs = new String[N];
        for (int i = 0; i < N; i++) seqs[i] = sc.next();
        String query = sc.next();

        for (String s : seqs) {
            if (s.equals(query)) {
                System.out.println(s);
                return;
            }
        }

        List<int[]> idxResults = new ArrayList<>();
        for (int i = 0; i < N; i++) {
            int d = editDist(seqs[i], query);
            if (d <= K) idxResults.add(new int[]{d, i});
        }

        if (idxResults.isEmpty()) {
            System.out.println("None");
        } else {
            idxResults.sort((a, b) -> {
                if (a[0] != b[0]) return a[0] - b[0];
                return seqs[a[1]].compareTo(seqs[b[1]]);
            });
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < idxResults.size(); i++) {
                if (i > 0) sb.append(" ");
                sb.append(seqs[idxResults.get(i)[1]]);
            }
            System.out.println(sb.toString());
        }
    }
}
import sys
input = sys.stdin.readline

def edit_dist(a, b):
    m, n = len(a), len(b)
    dp = list(range(n + 1))
    for i in range(1, m + 1):
        prev = dp[0]
        dp[0] = i
        for j in range(1, n + 1):
            tmp = dp[j]
            if a[i-1] == b[j-1]:
                dp[j] = prev
            else:
                dp[j] = 1 + min(prev, dp[j], dp[j-1])
            prev = tmp
    return dp[n]

K = int(input())
N = int(input())
seqs = [input().strip() for _ in range(N)]
query = input().strip()

for s in seqs:
    if s == query:
        print(s)
        exit()

results = []
for s in seqs:
    d = edit_dist(s, query)
    if d <= K:
        results.append((d, s))

if not results:
    print("None")
else:
    results.sort()
    print(" ".join(r[1] for r in results))
const readline = require('readline');
const rl = readline.createInterface({ input: process.stdin });
const lines = [];
rl.on('line', line => lines.push(line.trim()));
rl.on('close', () => {
    let idx = 0;
    const K = parseInt(lines[idx++]);
    const N = parseInt(lines[idx++]);
    const seqs = [];
    for (let i = 0; i < N; i++) seqs.push(lines[idx++]);
    const query = lines[idx++];

    for (const s of seqs) {
        if (s === query) {
            console.log(s);
            return;
        }
    }

    function editDist(a, b) {
        const m = a.length, n = b.length;
        const dp = new Array(n + 1);
        for (let j = 0; j <= n; j++) dp[j] = j;
        for (let i = 1; i <= m; i++) {
            let prev = dp[0];
            dp[0] = i;
            for (let j = 1; j <= n; j++) {
                const tmp = dp[j];
                if (a[i-1] === b[j-1]) dp[j] = prev;
                else dp[j] = 1 + Math.min(prev, dp[j], dp[j-1]);
                prev = tmp;
            }
        }
        return dp[n];
    }

    const results = [];
    for (const s of seqs) {
        const d = editDist(s, query);
        if (d <= K) results.push([d, s]);
    }

    if (results.length === 0) {
        console.log("None");
    } else {
        results.sort((a, b) => a[0] !== b[0] ? a[0] - b[0] : a[1].localeCompare(b[1]));
        console.log(results.map(r => r[1]).join(" "));
    }
});