# Load the trained word2vec model and compute the average vector of each user's search results.
import gensim
import numpy as np
# Build one averaged word2vec embedding per line of the tokenised query file.
file_name = './data/train_querylist_writefile-1w.csv'
cur_model = gensim.models.Word2Vec.load('1w_word2vec_300.model')

# Vocabulary membership tests and vector lookups go through the KeyedVectors
# object in `.wv`; doing either on the Word2Vec model itself was deprecated in
# gensim 3.x and removed in gensim 4.
word_vectors = cur_model.wv
# Take the embedding width from the loaded model instead of hard-coding 300.
vector_size = cur_model.vector_size

with open(file_name, 'r') as f:
    lines = f.readlines()

# Row i of doc_cev is the mean vector of the in-vocabulary words on line i.
doc_cev = np.zeros((len(lines), vector_size))
for cur_index, line in enumerate(lines):
    words = line.strip().split(' ')
    known_vectors = [word_vectors[word] for word in words if word in word_vectors]
    if known_vectors:
        # Accumulate in float64 so the result matches the precision of doc_cev.
        doc_cev[cur_index] = (
            np.sum(known_vectors, axis=0, dtype=np.float64) / len(known_vectors)
        )
    # Otherwise the row stays all-zero: dividing by a zero word count would
    # fill it with NaN and poison anything trained on doc_cev.
# Notebook-style inspection: bare expressions that only display a value in an
# interactive session; when run as a script they produce no output.
doc_cev.shape
# Indexes row 5, so this raises IndexError if doc_cev has fewer than six rows.
doc_cev[5]
# Load the gender, education and age labels as integer arrays.
# Presumably each file holds one label per line of the query file, since the
# arrays are later used to index doc_cev row-wise -- TODO confirm.
#
# The path is passed straight to np.loadtxt so NumPy opens and closes the file
# itself; handing it an already-open file object would leave the handle
# unclosed.
genderlabel = np.loadtxt('./data/train_gender.csv').astype(int)
genderlabel.shape  # interactive inspection only; no effect in a script
educationlabel = np.loadtxt('./data/train_education.csv').astype(int)
educationlabel.shape  # interactive inspection only; no effect in a script
agelabel = np.loadtxt('./data/train_age.csv').astype(int)
agelabel.shape  # interactive inspection only; no effect in a script
def removezero(x, y):
    """Drop the samples whose label is zero.

    Args:
        x: Array-like of samples. It is indexed with the positions of the
            non-zero entries of ``y``; for a 1-D ``y`` that selects rows.
        y: Array-like of labels. Plain lists are accepted as well as
            ndarrays.

    Returns:
        A tuple ``(x_kept, y_kept)`` of NumPy arrays holding only the entries
        at positions where ``y`` is non-zero. Both are new arrays; the inputs
        are not modified.
    """
    # Convert both inputs so list labels work too; asarray avoids copying data
    # that is already an ndarray.
    x = np.asarray(x)
    y = np.asarray(y)
    nozero = np.nonzero(y)
    # Indexing with integer index arrays always returns fresh copies.
    return x[nozero], y[nozero]
# Build a separate training set per task: each call filters the shared doc_cev
# matrix by that task's own label vector, keeping only rows whose label is
# non-zero, and rebinds the label name to the filtered labels.
gender_train, genderlabel = removezero(doc_cev, genderlabel)
age_train, agelabel = removezero(doc_cev, agelabel)
education_train, educationlabel = removezero(doc_cev, educationlabel)

# Report the (features, labels) shapes that remain for each task.
for features, labels in (
        (gender_train, genderlabel),
        (age_train, agelabel),
        (education_train, educationlabel),
):
    print (features.shape, labels.shape)