首先可以想到暴力吧。

每一轮回到 nn 显然需要跳 ngcd(i,n)\dfrac{n}{\gcd(i,n)} 次。

那么不妨直接暴力。这样是调和级数级别的复杂度。

但是 gcd\gcd 在大部分情况下很小,所以这个调和级数会被卡成 n2n^2 级别。

那么想到记忆化。由于答案是 ngcd(i,n)×i=1nai, i0 (mod gcd(i,n))\dfrac{n}{\gcd(i,n)} \times \min\limits_{i=1}^n a_i, \space i \equiv 0 \space (\bmod \space \gcd(i,n)),也就是答案基于 gcd(i,n)\gcd(i,n)。对于 gcd\gcd 相同的部分直接记下来即可。

但是还不够,此时的代码需要大约 1.2s1.2s 才能跑完。由于复杂度瓶颈在于 gcd\gcd 和调和级数枚举部分,而调和级数枚举无法用朴素方法在低于调和级数的复杂度内跑过,所以考虑优化 gcd\gcd 部分。

gcd\gcd 复杂度是 log\log 级别的。优化可以通过 gcd(i,n)=gcd(ni,n)\gcd(i,n)=\gcd(n-i,n) 来记录一半,另一半访问数组即可。

实测 n=107n=10^7 数据下 [0.7s,0.8s][0.7s,0.8s] 内可以跑过。

#pragma GCC optimize(2)
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-fwhole-program")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-fstrict-overflow")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-fhoist-adjacent-loads")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-funsafe-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
#include <bits/stdc++.h>
using ll = long long;
using namespace std;

const int Mod = 1004535809, N = 1e7 + 10;
int n, a[N], Gc[N]; unsigned seed, mod, Min[6];
ll res, ans[N];

inline unsigned read() {
  seed ^= seed << 13; 
  seed ^= seed >> 5; 
  seed ^= seed << 7; 
  return seed % mod + 1; 
}

int main() {
  scanf("%d%u%u", &n, &seed, &mod);
  for (int i = 1; i <= n; ++i) a[i] = read();
  int mid = (n + 1) / 2;
  for (int i = 1; i <= mid; ++i) {
    int gc = __gcd(i, n); Gc[i] = gc; ll turn = n / gc, minn = 2e9;
    if (ans[gc]) { res += ans[gc]; if (res > Mod) res -= Mod; continue; }
    for (int j = gc; j <= n; j += gc) minn = minn > a[j]? a[j] : minn;
    ans[gc] = turn * minn % Mod;
    res += ans[gc]; if (res >= Mod) res -= Mod;
  }
  Gc[0] = n;
  for (int i = mid + 1; i <= n; ++i) {
    int gc = Gc[n - i];
    ll turn = n / gc, minn = 2e9;
    if (ans[gc]) { res += ans[gc]; if (res > Mod) res -= Mod; continue; }
    for (int j = gc; j <= n; j += gc) minn = minn > a[j]? a[j] : minn;
    ans[gc] = turn * minn % Mod;
    res += ans[gc]; if (res >= Mod) res -= Mod;
  }
  printf("%lld\n", res);
  return 0;
}

upd. 这里的 gcd\gcd 换成 binary gcd\text{binary gcd} 可以更暴力地草过。

#include <bits/stdc++.h>
using ll = long long;
using namespace std;

const int Mod = 1004535809, N = 1e7 + 10;
int n, a[N]; unsigned seed, mod, Min[6];
ll res, ans[N];

inline unsigned read() {
  seed ^= seed << 13; 
  seed ^= seed >> 5; 
  seed ^= seed << 7; 
  return seed % mod + 1; 
}

inline int gcd(int a, int b) {
  int az = __builtin_ctz(a);
  int bz = __builtin_ctz(b);
  int z = min(az, bz); b >>= bz;
  while (a) {
    a >>= az; int diff = a - b;
    az = __builtin_ctz(diff);
    b = min(a, b), a = abs(diff);
  }
  return b << z;
}

int main() {
  scanf("%d%u%u", &n, &seed, &mod);
  for (int i = 1; i <= n; ++i) a[i] = read();
  for (int i = 1; i <= n; ++i) {
    int gc = gcd(i, n); ll turn = n / gc, minn = 2e9;
    if (ans[gc]) { res += ans[gc]; if (res > Mod) res -= Mod; continue; }
    for (int j = gc; j <= n; j += gc) minn = minn > a[j]? a[j] : minn;
    ans[gc] = turn * minn % Mod;
    res += ans[gc]; if (res >= Mod) res -= Mod;
  }
  printf("%lld\n", res);
  return 0;
}