python实现Hyperlink-Induced Topic Search算法
Hyperlink-Induced Topic Search是一个网页重要性的分析的算法。用户输入关键词后,算法对返回的匹配页面计算两种值,一种是枢纽值(Hub Scores),另一种是权威值(Authority Scores),这两种值是互相依存、互相影响的。所谓枢纽值,指的是页面上所有导出链接指向页面的权威值之和。权威值是指所有导入链接所在的页面中枢纽之和。在限定范围之后根据网页的出度和入度建立一个矩阵,通过矩阵的迭代运算和定义收敛的阈值不断对两个向量Authority和Hub值进行更新直至收敛。
def initialize_authority(pages):
#初始化权威值为1
return dict(zip(pages.keys(), [1]*len(pages)))
def clean_pages(pages):
for page in pages:
outside_links = []
for i in range(len(pages[page])):
if pages[page][i] not in pages or pages[page][i] == page: outside_links.append(i)
outside_links.reverse()
for outside_link in outside_links:
pages[page].pop(outside_link)
return pages
def initialize_L_matrices(pages):
L_matrix = pages
Lt_matrix = {}
for page in pages:
Lt_matrix[page] = []
for page in pages:
for link in pages[page]:
Lt_matrix[link].append(page)
return L_matrix, Lt_matrix
def multiply_matrix_vector(matrix, vector):
result_matrix = {}
for row in matrix:
result_matrix[row] = 0
for item in matrix[row]:
result_matrix[row] += vector[item]
return result_matrix
def normalize(vector):
max = 0
for component in vector:
if vector[component] > max:
max = vector[component]
if max == 0:
return vector
for component in vector:
vector[component] = float(vector[component]) / max
return vector
def vector_difference(vector1, vector2):
if not (vector1 and vector2): return float("inf")
total = 0
for component in vector1:
total += abs(vector1[component] - vector2[component])
return total
def HITS(pages):
pages = clean_pages(pages)
authority_old = None
authority = initialize_authority(pages)
(L_matrix, Lt_matrix) = initialize_L_matrices(pages)
while vector_difference(authority_old, authority) > 0.1:
authority_old = authority
hubbiness = normalize(multiply_matrix_vector(L_matrix, authority))
authority = normalize(multiply_matrix_vector(Lt_matrix, hubbiness))
return authority, hubbiness
def main():
pages = {"a":["b", "c"], "b":["f"], "c":["b", "e"], "d":["b"], "e":["c"],"f":[]}
(authority, hubbiness) = HITS(pages)
print("Authority: " + str(authority))
print("Hubbiness: " + str(hubbiness))
if __name__ == "__main__":
main()
PS:我在本段代码选用的测试案例为:
pages = {“a”:[“b”, “c”], “b”:[“f”], “c”:[“b”, “e”], “d”:[“b”], “e”:[“c”],“f”:[]}
这个字典包含了若干网页的出度和入度的关系。
2019年5月2日