算法_牛客博客

案例一：基于TF-IDF的关键词提取

TF-IDF：衡量某个词对文章的重要性，由TF和IDF两部分组成

TF：词频（因素：某词在同一文章中出现次数）

IDF：反文档频率（因素：某词是否在不同文章中出现）

TF-IDF = TF*IDF

TF ：一个单词在一篇文章出现次数越多越重要

IDF: 每篇文章都出现的单词（如的，你，我，他），越不重要

案例二：相似度计算公式

（1）余弦相似度cosine

举例：A(1,2,3) B(2,3,4)

cosine(A, B) = 分子 / 分母 = 20 / 20.12 ≈ 0.994

分子：A*B = 1*2 + 2*3 + 3*4 = 20

分母： ||A||* ||B|| = 3.74 * 5.38 = 20.12

||A|| = sqrt(A*A) = sqrt(1*1 + 2*2 + 3* 3) = 3.74

||B|| = sqrt(B*B) = sqrt(2*2 + 3*3 + 4*4) = 5.38

A表示一个句子，B表示另外一个句子

cosine(A, B) 表示两个句子的相似程度

（2）Jaccard 相似度：

A 用户（香蕉、苹果、鸭梨）

B 用户（苹果、橘子、桃子）

A∩B=苹果 1

A∪B=香蕉、苹果、鸭梨、橘子、桃子 5

相似度为1/5=0.2

# Step 1 文件整合把不同文本整合到一起

import os

import sys

import math

# Directory whose entries are the raw article files merged in Step 1.
file_path_dir = './data'
# Merged corpus: one article per line, "<doc id>\t<content>".
raw_path = './raw.data'
# IDF table: one word per line, "<word>\t<idf score>".
idf_path = './idf.data'

def read_file_handler(f):
    """Open the file at path ``f`` as UTF-8 text for reading and return it.

    The function does not close the handle; that is left to the caller.
    """
    return open(f, mode='r', encoding='utf-8')

# Step 1 (continued): walk the raw data directory and merge the scattered
# articles into a single file, one article per line in the form
# "<doc id>\t<content>", so that later steps can stream the corpus.
# After the loop, file_name holds the number of merged articles.
file_name = 0

with open(raw_path, 'w', encoding='utf-8') as file_raw_out:
    # Sort the listing so document ids are assigned deterministically.
    for fd in sorted(os.listdir(file_path_dir)):
        file_path = os.path.join(file_path_dir, fd)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as articles; skip them.
            continue

        # Close every article as soon as it is read; previously each handle
        # was left open for the lifetime of the script.
        with read_file_handler(file_path) as file_fd:
            content_list = [line.strip() for line in file_fd]

        # Collapse the article onto one line, joining its lines with spaces.
        content = '\t'.join([str(file_name), ' '.join(content_list)]) + '\n'
        file_raw_out.write(content)

        file_name += 1

# Step 2: compute the IDF score of every word and write it to idf_path.

# Step 1 leaves the number of merged articles in file_name.
docs_cnt = file_name

# word -> number of articles that contain the word (document frequency).
doc_freq = {}

with open(raw_path, 'r', encoding='utf-8') as fd:
    for line in fd:
        # Each line is one article: "<doc id>\t<content>".
        ss = line.strip().split('\t')
        if len(ss) != 2:
            continue
        _doc_id, file_content = ss

        # The content is already segmented with single spaces, so a plain
        # split is enough. Empty tokens (from consecutive spaces) are dropped.
        # De-duplicate with a set: IDF only cares whether a word occurs in an
        # article, not how many times.
        for word in {w for w in file_content.strip().split(' ') if w}:
            doc_freq[word] = doc_freq.get(word, 0) + 1

# Write the scores in lexicographic word order, one "<word>\t<idf>" per line.
with open(idf_path, 'w', encoding='utf-8') as file_idf_out:
    for word in sorted(doc_freq):
        # idf = log(N / (df + 1)); the +1 smooths the denominator.
        idf = math.log(float(docs_cnt) / (float(doc_freq[word]) + 1.0))
        file_idf_out.write('\t'.join([word, str(idf)]) + '\n')

# Step 3: score an input sentence with TF-IDF.

# Sample sentence to score.
# NOTE(review): get_tfidf splits its input on single spaces and this literal
# contains none, so it is treated as one token; confirm whether the
# separators between words were lost.
input_str = '我们带来阿里巴巴希望差我们我们我们'

# token -> IDF score, loaded into memory from the table written in Step 2.
token_idf_dict = {}

with open(idf_path, 'r', encoding='utf-8') as fd:
    for line in fd:
        ss = line.strip().split('\t')
        if len(ss) != 2:
            # Skip lines that are not exactly "<token>\t<score>".
            continue
        token, idf_score = ss
        # Parse the score once here instead of on every lookup.
        token_idf_dict[token] = float(idf_score)

def get_tfidf(input_str, idf_dict=None):
    """Yield ``(token, tf-idf)`` pairs for a space-separated sentence.

    TF is the raw number of times a token occurs in ``input_str``. Tokens
    that have no entry in the IDF table are skipped.

    Args:
        input_str: Sentence whose tokens are separated by single spaces.
        idf_dict: Optional mapping of token to IDF score (a number or a
            numeric string). Defaults to the module-level ``token_idf_dict``.

    Yields:
        ``(token, tf * idf)`` tuples, in order of each token's first
        occurrence in ``input_str``.
    """
    if idf_dict is None:
        idf_dict = token_idf_dict

    # Term frequency: count every non-empty token of the input.
    token_dict = {}
    for t in input_str.strip().split(' '):
        if not t:
            # Consecutive spaces produce empty tokens; they are not words.
            continue
        token_dict[t] = token_dict.get(t, 0) + 1

    for token, tf_score in token_dict.items():
        if token not in idf_dict:
            continue
        # float() keeps tables that store scores as strings working.
        yield (token, tf_score * float(idf_dict[token]))

# Print the TF-IDF score of every scored token in the sample sentence.
for k, v in get_tfidf(input_str):
    print(k, v)

# Compute the cosine similarity of two sentences.
# (This heading used to be a bare line of text, which raised NameError.)

# Step 1: Cosine
input1_str = '我们带来阿里巴巴希望差差差'

# Alternative second sentences to try:
# input2_str = '我们带来阿里巴巴好好好'
# input2_str = '我们带来搜狐好好好'
input2_str = '你们带来搜狐希望好好好'

def cosine(input1_str, input2_str):
    """Return the cosine similarity of two sentences' TF-IDF vectors.

    Each sentence is turned into a token -> tf-idf vector with ``get_tfidf``.
    Both vectors are L2-normalised and the result is their dot product over
    the tokens they share.

    Args:
        input1_str: First sentence, in the format ``get_tfidf`` accepts.
        input2_str: Second sentence, in the same format.

    Returns:
        The similarity as a float. ``0.0`` is returned when either vector
        has zero length (no scored tokens, or all scores are zero), where
        the similarity is undefined and the division used to fail.
    """
    # Compute each TF-IDF vector once; get_tfidf yields each token once.
    vec1 = dict(get_tfidf(input1_str))
    vec2 = dict(get_tfidf(input2_str))

    # math.fsum is used instead of the builtin sum so the function does not
    # depend on that name being unshadowed at module level.
    norm1 = math.sqrt(math.fsum(v * v for v in vec1.values()))
    norm2 = math.sqrt(math.fsum(v * v for v in vec2.values()))
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0

    final_score = 0.
    for k, v in vec2.items():
        if k not in vec1:
            continue
        final_score += (vec1[k] / norm1) * (v / norm2)

    return final_score

# Print the cosine similarity of the two sample sentences.
print(cosine(input1_str, input2_str))