字符串之hash_牛客博客

字符串有效的转化为一个整数
hash[i] = (hash[i - 1] * p + idx(s[i])) % P;
一个字符串到整数的映射，，一一对应是很完美的

模数的选取： 1610612741 805306457 402653189 201326611 等

冲突！！
调整p 和 P
p取一个素数，P一般用1e9 + 7；或者双hash

hash[l..r] = (hash[r] - hash[l - 1] * (p ^ (r - l + 1))) % P; 求s[l..r]的hash值
if(hash[l..r] <0) hash[l..r] += P;

双hash(最稳）
pair<hash1,hash2>

那些年用hash水过的算法hhh

1、kmp

问题：给两个字符串S1，S2，求S2是否是S1的子串，并求S2在S1中出现的次数

把S2 Hash出来，在S1里找所有长度为|S2|的子串，Hash比较。效率O(|S1|)

粘一个用Hash写的KMP模板题 POJ 3461

#include <iostream>
#include <cstring>
#include <cstdio>
using namespace std;

typedef unsigned long long ll;
const int base = 27;
const int maxn = 1e6+10;

char sub[maxn],str[maxn];
ll p[maxn];
ll Hash[maxn];
int T;

int main()
{
    scanf("%d",&T);
    p[0] = 1;
    for(int i = 1; i < maxn; i++)
        p[i] = p[i - 1] * base;//进位，也是反向数的，比如12354,123是1000
    while(T--)
    {
        memset(sub,0,sizeof(sub));
        memset(str,0,sizeof(str));
        scanf("%s",sub);
        scanf("%s",str);
        int L = strlen(sub);
        int n = strlen(str);
        ll H = 0;
        for(int i = L - 1; i >= 0; i--)
            H = H * base + sub[i]; //反向赋值
        Hash[n] = 0;
        for(int i = n - 1; i >= 0; i--)
            Hash[i] = Hash[i + 1] * base + str[i];//反向赋值，便于取前面字符串
        int ans = 0;
        for(int i = 0; i <= n - L; i++)
        {
            if(H == Hash[i] - Hash[i + L] * p[L])
                ans++;
        }
        printf("%d\n",ans);
    }
    return 0;
}

2、AC自动机

问题：给N个单词串，和一个文章串，求每个单词串是否是文章串的子串，并求每个单词在文章中出现的次数。

把每一个单词hash成整数，再把文章的每一个子串hash成整数，接下来只需要进行整数上的查找即可。

复杂度：O(|A|2+|S|)

用AC自动机可以做到O(|A|+|S|)的复杂度,|S|是单词串总长，|A|是文章长度

3、后缀数组

问题：给两个字符串S1,S2，求它们的最长公共子串的长度。

将S1的每一个子串都hash成一个整数，将S2的每一个子串都hash成一个整数

两堆整数，相同的配对，并且找到所表示的字符串长度最大的即可。

复杂度：O(|S1|2+|S2|2)

用后缀数组可以优化到O(|S|log|S|)

4、马拉车

问题：给一个字符串S，求S的最长回文子串。

先求子串长度位奇数的，再求偶数的。枚举回文子串的中心位置，然后二分子串的长度，直到找到一个该位置的最长回文子串，不断维护长度最大值即可。

复杂度：O(|S|log|S|)

用manacher可以做到O(|S|)的复杂度

5、扩展kmp

问题：给一个字符串S，求S的每个后缀与S的最长公共前缀

枚举每一个后缀的起始位置，二分长度，求出每个后缀与S的最长公共前缀。

复杂度：O(|S|log|S|)

用extend-kmp可以做到O(|S|)

一个自动取模的方法 hash值储存在unsigned long long里面，那样溢出时，会自动取余2的64次方，but这样可能会使2个不同串的哈希值相同，但这样的概率极低（不排除你的运气不好）。

思想

在信息学奥赛中，使用最广泛的算法叫做：BKDR Hash

它的核心思想是：

对于一个字符串，选取恰当的进制，将一个字符串看做是一个大整数（众人：***,你这是要让我们写高精啊）然后再对一个随便什么数取模就可以啦

当然这个“恰当的进制”和“随便什么数”是有讲究的

根据砖家的研究：

进制一般选择大于字符串中的最大的字符且不含模数的值因子的数

比如说，如果你是对一串小写字母做字符串hash，那么131这个进制就是不错的选择

而“随便什么数”有三种方法

选择两个模数，判断的时候只有两个hash值都相同才算相同
选择一个大质数，像11111111111111111111111或者212370440130137957
用unsigned long long 自然溢出

注意：尽量不要只用一个较小的质数，根据生日悖论，很容易被卡

洛谷 P3370 【模板】字符串哈希必做练手题

3种姿势

unsigned long long 自然溢出

#include<bits/stdc++.h>
using namespace std;

const int p = 27;
set<long long>a;
int n,len;
unsigned long long hash1;
string s;

int main()
{
    scanf("%d",&n);
    for(int i = 0;i < n; ++i)
    {
        cin>>s;
        len = s.size();
        hash1 = s[0];
        for(int j = 1;j < len; ++j)
            hash1 = hash1 * p + s[j];
        a.insert(hash1);
    }
    printf("%d\n",a.size());
    return 0;
}

双hash

#include<bits/stdc++.h>
using namespace std;

const long long P1 = 1610612741;
const long long P2 = 805306457;
const int p = 131;
set<pair<long long ,long long> >a;
int n,len;
long long hash1,hash2;
string s;

int main()
{
    scanf("%d",&n);
    for(int i = 0;i < n; ++i)
    {
        cin>>s;
        len = s.size();
        hash1 = hash2 = s[0];
        for(int j = 1;j < len; ++j)
        {
             hash1 = (hash1 * p + s[j]) % P1;
             hash2 = (hash2 * p + s[j]) % P2;
        }
        a.insert(make_pair(hash1,hash2));
    }
    printf("%d\n",a.size());
    return 0;
}

大质数hash

#include<bits/stdc++.h>
using namespace std;

const long long P = 212370440130137957;
const int p = 27;
set<long long>a;
int n,len;
long long hash1;
string s;

int main()
{
    scanf("%d",&n);
    for(int i = 0;i < n; ++i)
    {
        cin>>s;
        len = s.size();
        hash1 = s[0];
        for(int j = 1;j < len; ++j)
            hash1 = (hash1 * p + s[j]) % P;
        a.insert(hash1);
    }
    printf("%d\n",a.size());
    return 0;
}

KMP模板题

#include <bits/stdc++.h>
using namespace std;

typedef unsigned long long ull;
int base = 155;
ull po[100010], hs[100010 * 100];
int s1[100010], s2[100010 * 100];

ull solve(int l, int r)
{
    return hs[r] - hs[l - 1] * po[r - l + 1];
}

int main()
{
    po[0] = 1;
    for(int i = 1; i <= 10010; i++)
        po[i] = po[i - 1] * base;
    int t;
    scanf("%d", &t);
    while(t--)
    {
        int n, m;
        scanf("%d %d", &n, &m);
        ull a1 = 0, pos = 0;
        for(int i = 1; i <= n; i++)
        {
            scanf("%d", &s2[i]);
            hs[i] = hs[i - 1] * base + s2[i];
        }
        for(int i = 1; i <= m; i++)
        {
            scanf("%d", &s1[i]);
            a1 = a1 * base + s1[i];
        }
        for(int i = 1; i <= n; i++)
        {
            if(a1 == solve(i, i + m - 1))
            {
                pos = i;
                break;
            }
        }
        if(!pos) printf("-1\n");
        else printf("%d\n", pos);
    }
}