import java.util.*;
import java.io.*;

// Note: the class name must be Main, and the file must not contain any package declaration.
public class Main {
    public static void main(String[] args) throws Exception {
        BufferedReader bf = new BufferedReader(new InputStreamReader(System.in));
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(System.out));
        int n = Integer.parseInt(bf.readLine());
        String[] lines = new String[n];
        for (int i = 0; i < n; i++) {
            lines[i] = bf.readLine();
        }
        int k = Integer.parseInt(bf.readLine());
        int p = Integer.parseInt(bf.readLine());
        for (int i = 0; i < p; i++) {
            String[] line = bf.readLine().split(" ");
            int t = Math.min(Integer.parseInt(line[0]), n - 1);
            String[] info = Arrays.copyOfRange(line, 1, line.length);
            int bestDoc = calc(t, k, lines, info);
            bw.write(String.valueOf(bestDoc));
            if (i != p - 1) {
                bw.write(" ");
            }
        }
        bw.flush();
        bw.close();
    }

    // Build TF-IDF vectors over the window [t - k + 1, t] and return the index of the
    // best-matching document for the query words in `info`, or -1 if none qualifies.
    @SuppressWarnings("unchecked")
    private static int calc(int t, int k, String[] lines, String[] info) {
        int start = t - k + 1;
        Map<String, Integer>[] f = calcF(lines, start, t);
        Map<String, Integer> tf = f[0];
        Map<String, Integer> df = f[1];

        // One shared TF-IDF weight per word, with smoothed IDF: log((k + 1) / (df + 1)) + 1.
        Map<String, Double> embedding = new HashMap<>();
        for (Map.Entry<String, Integer> entry : df.entrySet()) {
            String key = entry.getKey();
            double idfItem = Math.log((double) (k + 1) / (double) (entry.getValue() + 1)) + 1.0;
            embedding.put(key, tf.get(key) * idfItem);
        }

        // Query vector and its norm.
        Map<String, Double> queryEmbedding = new HashMap<>();
        double queryNorm = 0.0;
        for (String word : info) {
            double tfIDF = embedding.getOrDefault(word, 0.0);
            queryNorm += tfIDF * tfIDF;
            queryEmbedding.put(word, tfIDF);
        }
        queryNorm = Math.sqrt(queryNorm);

        Map<String, Double>[] documentEmbeddings = new HashMap[k];
        Arrays.setAll(documentEmbeddings, v -> new HashMap<>());
        double maxSimilarity = -1.0;
        int bestDoc = -1;

        // Recency weights: the i-th document of the window (1-based) gets weight i / k.
        // Note the explicit cast: `i / k` would be integer division and zero out all but
        // the last weight.
        double[] weights = new double[k];
        for (int i = 1; i <= k; i++) {
            weights[i - 1] = (double) i / k;
        }

        int index = start;
        for (Map<String, Double> documentEmbedding : documentEmbeddings) {
            String line = lines[index];
            double documentNorm = 0.0;
            for (String word : line.split(" ")) {
                double tfIDF = embedding.getOrDefault(word, 0.0);
                documentNorm += tfIDF * tfIDF;
                documentEmbedding.put(word, tfIDF);
            }
            documentNorm = Math.sqrt(documentNorm);

            // Cosine similarity between the query and this document.
            double dotProduct = 0.0;
            for (Map.Entry<String, Double> queryWord : queryEmbedding.entrySet()) {
                if (documentEmbedding.containsKey(queryWord.getKey())) {
                    dotProduct += documentEmbedding.get(queryWord.getKey()) * queryWord.getValue();
                }
            }
            double similarity = 0.0;
            if (queryNorm > 1e-12 && documentNorm > 1e-12) {
                similarity = dotProduct / (queryNorm * documentNorm);
            }

            // Update the best document; only similarities of at least 0.6 qualify.
            if (similarity >= 0.6) {
                if (similarity > maxSimilarity) {
                    maxSimilarity = similarity;
                    bestDoc = index;
                } else if (Math.abs(similarity - maxSimilarity) < 1e-12) {
                    // Near-tie: compare recency-weighted scores. Prefer the later document
                    // only when its weighted score is strictly higher; otherwise keep the
                    // earlier index.
                    double temp = similarity * weights[index - start] - maxSimilarity * weights[bestDoc - start];
                    if (temp < 1e-12) {
                        if (index < bestDoc) {
                            bestDoc = index;
                        }
                    } else {
                        bestDoc = index;
                    }
                }
            }
            index++;
        }
        return bestDoc;
    }

    // Compute term frequency (TF) over all tokens and document frequency (DF) over
    // distinct tokens per line, for the window [start, t].
    @SuppressWarnings("unchecked")
    private static Map<String, Integer>[] calcF(String[] lines, int start, int t) {
        Map<String, Integer> freqTF = new HashMap<>();
        Map<String, Integer> freqDF = new HashMap<>();
        for (int i = start; i <= t; i++) {
            String[] words = lines[i].split(" ");
            for (String word : words) {
                freqTF.put(word, freqTF.getOrDefault(word, 0) + 1);
            }
            // DF counts each word at most once per document, hence the set. (The original
            // incremented freqDF from freqTF's counts, which double-counted repeated words.)
            for (String word : new HashSet<>(Arrays.asList(words))) {
                freqDF.put(word, freqDF.getOrDefault(word, 0) + 1);
            }
        }
        return new Map[]{freqTF, freqDF};
    }
}
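
/*
 * For reference, a minimal standalone sketch of the scoring used above: the smoothed
 * IDF (log((k + 1) / (df + 1)) + 1) times the raw term count gives each word one shared
 * weight, and documents are ranked by cosine similarity against the query. The class
 * name, the toy documents, and the query below are hypothetical illustrations, not part
 * of the task input; a second non-public class in Main.java compiles fine, or move this
 * to its own file and run `java SimilaritySketch`.
 */
class SimilaritySketch {
    public static void main(String[] args) {
        String[] docs = {"a b a", "b c"};   // hypothetical window of k = 2 documents
        String[] query = {"a", "c"};        // hypothetical query words

        int k = docs.length;

        // TF over all tokens, DF over distinct tokens per document (as in Main.calcF).
        Map<String, Integer> tf = new HashMap<>(), df = new HashMap<>();
        for (String doc : docs) {
            String[] words = doc.split(" ");
            for (String w : words) tf.merge(w, 1, Integer::sum);
            for (String w : new HashSet<>(Arrays.asList(words))) df.merge(w, 1, Integer::sum);
        }

        // One shared TF-IDF weight per word (as in Main.calc).
        Map<String, Double> weight = new HashMap<>();
        for (Map.Entry<String, Integer> e : df.entrySet()) {
            double idf = Math.log((double) (k + 1) / (e.getValue() + 1)) + 1.0;
            weight.put(e.getKey(), tf.get(e.getKey()) * idf);
        }

        // Cosine similarity of the query against each document. Since query and document
        // reuse the same shared weight for a word, the dot product reduces to the sum of
        // squared weights over the words they have in common.
        for (String doc : docs) {
            String[] tokens = doc.split(" ");
            Set<String> docWords = new HashSet<>(Arrays.asList(tokens));
            double dot = 0, qNorm = 0, dNorm = 0;
            for (String w : tokens) {                   // norm over all tokens, mirroring Main.calc
                double v = weight.getOrDefault(w, 0.0);
                dNorm += v * v;
            }
            for (String w : query) {                    // query words assumed distinct here
                double v = weight.getOrDefault(w, 0.0);
                qNorm += v * v;
                if (docWords.contains(w)) dot += v * v; // shared weight, so the dot adds v squared
            }
            double sim = (qNorm > 1e-12 && dNorm > 1e-12)
                    ? dot / (Math.sqrt(qNorm) * Math.sqrt(dNorm))
                    : 0.0;
            System.out.println("sim(query, \"" + doc + "\") = " + sim);
        }
    }
}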