题解 | #DBSCAN聚类#

题目链接

DBSCAN聚类

题目描述

本题要求在二维或三维实数坐标系中，使用 DBSCAN (Density-Based Spatial Clustering of Applications with Noise) 算法进行聚类分析。

核心定义:

邻域半径 (eps): 定义了“邻近”的范围。
最小样本数 (min_samples): 判断一个区域是否为“密集”的阈值。
核心点: 如果一个点的 eps 半径邻域内包含的样本数（含自身）不少于 min_samples，则该点为核心点。
簇: 从一个未访问过的核心点出发，通过邻域可达关系（密度可达）能够扩展到的所有点的集合，形成一个簇。
噪声点: 未被任何簇吸收的点。

输入:

第一行包含三个值: eps, min_samples, 和点的总数 N。
接下来 N 行，每行是点的坐标（2个或3个实数）。

输出:

一行，包含两个整数：找到的簇的数量和噪声点的数量。

解题思路

DBSCAN 是一种基于密度的聚类算法，它能发现任意形状的簇，并能很好地识别出噪声点。其核心思想是：一个簇就是一片密度相连的点的集合。

数据结构与初始化
- 用一个列表或数组存储所有的点，每个点包含其坐标。
- 创建一个状态数组 cluster_ids，长度与点的数量相同，用于标记每个点的状态。可以约定：0 代表未访问，-1 代表噪声，正整数 k 代表属于第 k 个簇。
- 为了避免在距离计算时频繁开方，我们可以预先计算 eps 的平方 eps_squared，后续直接比较距离的平方。
核心算法流程
- 初始化簇数量 cluster_count = 0。
- 遍历每一个点 p（从索引 0 到 N-1）： a. 跳过已分类点: 如果点 p 已经被访问过（即 cluster_ids[p] 不为 0），则直接跳到下一个点。 b. 寻找邻居: 找到 p 在 eps 半径内的所有邻居点（包括 p 自身）。这个过程称为 regionQuery。 c. 判断是否为核心点: - 如果 p 的邻居数量小于 min_samples，说明它不是一个核心点。暂时将其标记为噪声点（cluster_ids[p] = -1）。注意，这个噪声点后续可能会被其他簇吸收，成为一个“边界点”。 - 如果 p 的邻居数量 大于等于 min_samples，说明它是一个核心点，可以形成一个新的簇。 d. 扩展簇 (Expand Cluster): - 将簇数量 cluster_count 加 1。 - 创建一个队列（或栈），用于存放待处理的邻居点。将 p 的所有邻居都加入这个队列。 - 将核心点 p 自身归入新的簇：cluster_ids[p] = cluster_count。 - 处理队列: 当队列不为空时，循环执行： i. 从队列中取出一个点 q。 ii. 如果 q 之前被标记为噪声，现在将它归入当前簇（cluster_ids[q] = cluster_count）。 iii.如果 q 尚未被访问（cluster_ids[q] == 0），则将它归入当前簇，并寻找 q 的所有邻居。 iv. 如果 q 本身也是一个核心点（其邻居数 >= min_samples），则将它所有未被分类的邻居都加入到队列中，等待进一步扩展。
统计结果
- 遍历完所有点后，cluster_count 就是最终的簇数量。
- 再次遍历 cluster_ids 数组，统计值为 -1 的点的数量，即为噪声点数。

代码

cpp
java
python

#include <iostream>
#include <vector>
#include <cmath>
#include <string>
#include <sstream>
#include <queue>

using namespace std;

// 计算两点之间距离的平方
double dist_sq(const vector<double>& p1, const vector<double>& p2) {
    double d_sq = 0.0;
    for (size_t i = 0; i < p1.size(); ++i) {
        d_sq += (p1[i] - p2[i]) * (p1[i] - p2[i]);
    }
    return d_sq;
}

// 寻找给定点的所有邻居
vector<int> region_query(int p_idx, const vector<vector<double>>& points, double eps_sq) {
    vector<int> neighbors;
    for (int i = 0; i < points.size(); ++i) {
        if (dist_sq(points[p_idx], points[i]) <= eps_sq) {
            neighbors.push_back(i);
        }
    }
    return neighbors;
}

int main() {
    ios::sync_with_stdio(false);
    cin.tie(NULL);

    double eps;
    int min_samples, n;
    cin >> eps >> min_samples >> n;
    cin.ignore(); 

    vector<vector<double>> points(n);
    int dim = 0;
    for (int i = 0; i < n; ++i) {
        string line;
        getline(cin, line);
        stringstream ss(line);
        double coord;
        while (ss >> coord) {
            points[i].push_back(coord);
        }
        if (i == 0) {
            dim = points[i].size();
        }
    }

    double eps_sq = eps * eps;
    vector<int> cluster_ids(n, 0); // 0: 未访问, -1: 噪声, >0: 簇ID
    int cluster_count = 0;

    for (int i = 0; i < n; ++i) {
        if (cluster_ids[i] != 0) continue; // 已访问

        vector<int> neighbors = region_query(i, points, eps_sq);

        if (neighbors.size() < min_samples) {
            cluster_ids[i] = -1; // 标记为噪声
        } else {
            cluster_count++;
            queue<int> q;
            for (int neighbor_idx : neighbors) {
                q.push(neighbor_idx);
            }
            cluster_ids[i] = cluster_count;

            while (!q.empty()) {
                int current_p = q.front();
                q.pop();

                if (cluster_ids[current_p] == -1) { // 从噪声点变成边界点
                    cluster_ids[current_p] = cluster_count;
                }
                
                if (cluster_ids[current_p] != 0) continue; // 已被分类

                cluster_ids[current_p] = cluster_count;
                vector<int> current_neighbors = region_query(current_p, points, eps_sq);
                if (current_neighbors.size() >= min_samples) {
                    for (int neighbor_idx : current_neighbors) {
                        if (cluster_ids[neighbor_idx] == 0 || cluster_ids[neighbor_idx] == -1) {
                            q.push(neighbor_idx);
                        }
                    }
                }
            }
        }
    }

    int noise_count = 0;
    for (int id : cluster_ids) {
        if (id == -1) {
            noise_count++;
        }
    }

    cout << cluster_count << " " << noise_count << endl;

    return 0;
}

import java.util.Scanner;
import java.util.ArrayList;
import java.util.List;
import java.util.Queue;
import java.util.LinkedList;

public class Main {

    private static double distSq(List<Double> p1, List<Double> p2) {
        double dSq = 0.0;
        for (int i = 0; i < p1.size(); i++) {
            double diff = p1.get(i) - p2.get(i);
            dSq += diff * diff;
        }
        return dSq;
    }

    private static List<Integer> regionQuery(int pIdx, List<List<Double>> points, double epsSq) {
        List<Integer> neighbors = new ArrayList<>();
        for (int i = 0; i < points.size(); i++) {
            if (distSq(points.get(pIdx), points.get(i)) <= epsSq) {
                neighbors.add(i);
            }
        }
        return neighbors;
    }

    public static void main(String[] args) {
        Scanner sc = new Scanner(System.in);
        double eps = sc.nextDouble();
        int minSamples = sc.nextInt();
        int n = sc.nextInt();
        sc.nextLine(); 

        List<List<Double>> points = new ArrayList<>();
        for (int i = 0; i < n; i++) {
            String[] parts = sc.nextLine().trim().split("\\s+");
            List<Double> point = new ArrayList<>();
            for (String part : parts) {
                point.add(Double.parseDouble(part));
            }
            points.add(point);
        }

        double epsSq = eps * eps;
        int[] clusterIds = new int[n]; // 0: 未访问, -1: 噪声, >0: 簇ID
        int clusterCount = 0;

        for (int i = 0; i < n; i++) {
            if (clusterIds[i] != 0) continue; // 已访问

            List<Integer> neighbors = regionQuery(i, points, epsSq);

            if (neighbors.size() < minSamples) {
                clusterIds[i] = -1; // 标记为噪声
            } else {
                clusterCount++;
                Queue<Integer> q = new LinkedList<>(neighbors);
                clusterIds[i] = clusterCount;

                while (!q.isEmpty()) {
                    int currentP = q.poll();

                    if (clusterIds[currentP] == -1) { // 从噪声点变成边界点
                        clusterIds[currentP] = clusterCount;
                    }
                    if (clusterIds[currentP] != 0) continue; // 已被分类
                    
                    clusterIds[currentP] = clusterCount;
                    List<Integer> currentNeighbors = regionQuery(currentP, points, epsSq);
                    if (currentNeighbors.size() >= minSamples) {
                        for (int neighborIdx : currentNeighbors) {
                             if (clusterIds[neighborIdx] == 0 || clusterIds[neighborIdx] == -1) {
                                q.add(neighborIdx);
                            }
                        }
                    }
                }
            }
        }

        int noiseCount = 0;
        for (int id : clusterIds) {
            if (id == -1) {
                noiseCount++;
            }
        }
        System.out.println(clusterCount + " " + noiseCount);
    }
}

from collections import deque

def region_query(p_idx, points, eps_sq):
    neighbors = []
    p1 = points[p_idx]
    for i, p2 in enumerate(points):
        dist_sq = sum([(c1 - c2) ** 2 for c1, c2 in zip(p1, p2)])
        if dist_sq <= eps_sq:
            neighbors.append(i)
    return neighbors

def main():
    # 读取参数
    line1 = input().split()
    eps = float(line1[0])
    min_samples = int(line1[1])
    n = int(line1[2])

    # 读取所有点
    points = []
    for _ in range(n):
        points.append(list(map(float, input().split())))

    eps_sq = eps * eps
    cluster_ids = [0] * n  # 0: 未访问, -1: 噪声, >0: 簇ID
    cluster_count = 0

    for i in range(n):
        if cluster_ids[i] != 0:
            continue

        neighbors = region_query(i, points, eps_sq)

        if len(neighbors) < min_samples:
            cluster_ids[i] = -1  # 标记为噪声
        else:
            cluster_count += 1
            cluster_ids[i] = cluster_count
            
            q = deque(neighbors)
            while q:
                current_p_idx = q.popleft()

                if cluster_ids[current_p_idx] == -1: # 从噪声点变成边界点
                    cluster_ids[current_p_idx] = cluster_count
                
                if cluster_ids[current_p_idx] != 0: # 已被分类
                    continue

                cluster_ids[current_p_idx] = cluster_count
                current_neighbors = region_query(current_p_idx, points, eps_sq)

                if len(current_neighbors) >= min_samples:
                    for neighbor_idx in current_neighbors:
                        if cluster_ids[neighbor_idx] == 0 or cluster_ids[neighbor_idx] == -1:
                            q.append(neighbor_idx)
    
    noise_count = sum(1 for cid in cluster_ids if cid == -1)
    
    print(f"{cluster_count} {noise_count}")

if __name__ == "__main__":
    main()

算法及复杂度

算法：DBSCAN 聚类算法
时间复杂度： $\mathcal{O}(N^2)$ ，其中 $N$ 是数据点的数量。算法的主要开销在于为每个点查找其邻居（region_query），这个操作需要遍历所有其他点，因此是 $\mathcal{O}(N)$ 。由于每个点都可能触发一次这样的查找，总的时间复杂度为 $\mathcal{O}(N^2)$ 。
空间复杂度： $\mathcal{O}(N \cdot D)$ ，其中 $D$ 是点的维度（2或3）。主要空间用于存储所有 $N$ 个点的数据，以及一个大小为 $N$ 的状态数组 cluster_ids。