GloVe(全局词向量嵌入)
目录
GloVe简介
1.使用预训练的GloVe的词向量(英文文本的用的最多)
2.自己训练Glove词向量
3. 知识点
GloVe简介
GloVe的全称叫Global Vectors for Word Representation,它是一个基于全局词频统计(count-based & overall statistics)的词表征(word representation)工具, 是斯坦福大学在2014年提出的模型. 论文地址: https://aclanthology.org/D14-1162.pdf
1.使用预训练的GloVe的词向量(英文文本的用的最多)
# Load pretrained GloVe vectors with gensim and query them.
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Path to the pretrained GloVe vector file.
# datapath() requires an absolute path; the r-prefix keeps backslashes literal.
glove_file = datapath(r'E:/ALOT/10_deep_learning/data/glove.6B.100d.txt')

# Create an empty temporary file that will hold the word2vec-format copy.
word2vec_glove_file = get_tmpfile('glove.6B.100d.word2vec.txt')

# Convert the GloVe file to word2vec text format (prepends the
# "<vocab_size> <dim>" header line that word2vec loaders expect).
# Returns (400000, 100): 400,000 vectors, each 100-dimensional.
glove2word2vec(glove_file, word2vec_glove_file)

# Load the converted file as keyed vectors.
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

# Words most similar to 'apple':
model.most_similar('apple')
# [('microsoft', 0.7449405193328857), ('ibm', 0.6821643114089966),
#  ('intel', 0.6778088212013245), ('software', 0.6775422692298889),
#  ('dell', 0.6741442680358887), ('pc', 0.6678153276443481),
#  ('macintosh', 0.6617538332939148), ('iphone', 0.6595612168312073),
#  ('ipod', 0.6534676551818848), ('hewlett', 0.6516579389572144)]

# Analogy query: 'king' is to 'man' as ? is to 'woman'.
model.most_similar(positive=['woman', 'king'], negative=['man'])
# [('queen', 0.7698540687561035), ('monarch', 0.6843381524085999),
#  ('throne', 0.6755736470222473), ('daughter', 0.6594556570053101),
#  ('princess', 0.6520534157752991), ('prince', 0.6517034769058228),
#  ('elizabeth', 0.6464517712593079), ('mother', 0.631171703338623),
#  ('emperor', 0.6106470823287964), ('wife', 0.6098655462265015)]
2.自己训练Glove词向量
# Install the required packages (run these in a notebook cell / shell):
# !pip install glove-python-binary -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install pkuseg -i https://pypi.tuna.tsinghua.edu.cn/simple
# (pkuseg is a Chinese word-segmentation library from Peking University.)

import gensim
from glove import Glove
from glove import Corpus
import pkuseg

# Segment the raw Chinese text file into space-separated tokens.
# Relative paths are fine here (unlike gensim's datapath above).
# nthread=5: number of worker processes used for segmentation.
pkuseg.test(r'../data/不要等到毕业以后.txt',
            r'../data/不要等到毕业以后_分词.txt',
            nthread=5)

# Build the training corpus: a 2-D list — one inner list of tokens per line.
with open(r'../data/不要等到毕业以后_分词.txt', 'r', encoding='utf-8') as f:
    # line.strip() filters out blank lines; each kept line is split on spaces.
    sentences = [line.replace('\n', '').split(' ')
                 for line in f.readlines()
                 if line.strip() != '']

sentences
# Partial output:
# [['迄今', '最', '实用', '的', '大学生', '人生', '规划', '工具书', '。'],
#  ['这是', '一', '本', '用心', '打磨', '的', '书', '。'],
#  ['完全', '不同于', '那些', '东拼西凑', '的', '大学生', '人生', '指南', ';'],
#  ['没有', '理论', '的', '说教', ',', '只有', '苦口婆心', '地', '规劝', '。'],
#  ['57', '条', '人生', '成长', '建议', '、', '57', '个', ...], ...]
# Build the word co-occurrence statistics from the tokenized sentences.
corpus_model = Corpus()  # empty corpus instance

# Fit the corpus: builds the vocabulary and the sparse co-occurrence matrix.
# window=10: context window size used when counting co-occurrences.
corpus_model.fit(sentences, window=10)

# Persist the fitted corpus model so it can be reloaded later without refitting.
corpus_model.save('../data/corpus_model')

# word -> integer index mapping built during fit().
corpus_model.dictionary
# Partial output:
# {'迄今': 0, '最': 1, '实用': 2, '的': 3, '大学生': 4,
#  '人生': 5, '规划': 6, '工具书': 7, '。': 8, '这是': 9, ...}

# Number of non-zero entries in the sparse co-occurrence matrix
# (i.e. distinct co-occurring word pairs, NOT the vocabulary size).
corpus_model.matrix.nnz
# 16184
# Train the GloVe model on the co-occurrence matrix.
# no_components: dimensionality of the learned word vectors
#                (rule of thumb: keep it at or below ~300).
glove = Glove(no_components=100, learning_rate=0.05)

# epochs=10: number of passes over the co-occurrence data.
# no_threads=1: number of parallel training threads.
# verbose=True: print a log line per epoch during training.
glove.fit(corpus_model.matrix, epochs=10, no_threads=1, verbose=True)
# Performing 10 training epochs with 1 threads
# Epoch 0 ... Epoch 9
# Attach the word->index dictionary so vectors can be looked up by word string.
glove.add_dictionary(corpus_model.dictionary)

glove.dictionary
# Partial output:
# {'迄今': 0, '最': 1, '实用': 2, '的': 3, '大学生': 4, '人生': 5,
#  '规划': 6, '工具书': 7, '。': 8, '这是': 9, '一': 10, ...}

# Look up the 100-dimensional vector of a single word.
glove.word_vectors[glove.dictionary['我']]
# array([-3.85271677e-03,  6.03527930e-03, -6.46135204e-03,  7.04933438e-04,
#        ...
#        -1.08741576e-03, -5.77238705e-03,  8.43079576e-03,  3.88278545e-03])

# Ten words most similar to '专业'.
glove.most_similar('专业', number=10)
# [('的', 0.8430016342292636), (',', 0.8206669513798159),
#  ('是', 0.6985497214327246), ('。', 0.6971518842587385),
#  ('我', 0.6554453225214771), ('你', 0.6365204850038154),
#  ('不', 0.5187983190436662), ('自己', 0.49869570695139864),
#  ('一', 0.47569447514853663)]

# Shape of the full embedding matrix: (vocab_size, no_components).
glove.word_vectors.shape
# (939, 100)

glove.word_vectors
# array([[ 2.22664080e-03, -4.40492727e-04,  1.56658813e-03, ...],
#        ...,
#        [-3.12888567e-03,  8.80138770e-05, -4.06701493e-03, ...]])

# Dense view of the co-occurrence matrix (can be very large for big corpora).
corpus_model.matrix.todense().shape
# (939, 939)
3. 知识点
GloVe的全称叫Global Vectors for Word Representation,它是一个基于全局词频统计(count-based & overall statistics)的词表征(word representation)工具, 是斯坦福大学在2014年提出的模型. 论文地址: https://aclanthology.org/D14-1162.pdf