>>> # import modules & set up logging
>>> import os
>>> import gensim, logging
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
>>>
>>> sentences = [['first', 'sentence'], ['second', 'sentence']]
>>> # train word2vec on the two sentences
>>> model = gensim.models.Word2Vec(sentences, min_count=1)
>>> class MySentences(object):
...     def __init__(self, dirname):
...         self.dirname = dirname
...
...     def __iter__(self):
...         for fname in os.listdir(self.dirname):
...             for line in open(os.path.join(self.dirname, fname)):
...                 yield line.split()
>>>
>>> sentences = MySentences('/some/directory') # a memory-friendly iterator
>>> model = gensim.models.Word2Vec(sentences)
>>> model = gensim.models.Word2Vec() # an empty model, no training
>>> model.build_vocab(some_sentences) # can be a non-repeatable, 1-pass generator
>>> model.train(other_sentences) # can be a non-repeatable, 1-pass generator
>>> model = Word2Vec(sentences, min_count=10) # default value is 5
>>> model = Word2Vec(sentences, size=200) # default value is 100
>>> model = Word2Vec(sentences, workers=4) # default = 1 worker = no parallelization
>>> model.accuracy('/tmp/questions-words.txt')
2014-02-01 22:14:28,387 : INFO : family: 88.9% (304/342)
2014-02-01 22:29:24,006 : INFO : gram1-adjective-to-adverb: 32.4% (263/812)
2014-02-01 22:36:26,528 : INFO : gram2-opposite: 50.3% (191/380)
2014-02-01 23:00:52,406 : INFO : gram3-comparative: 91.7% (1222/1332)
2014-02-01 23:13:48,243 : INFO : gram4-superlative: 87.9% (617/702)
2014-02-01 23:29:52,268 : INFO : gram5-present-participle: 79.4% (691/870)
2014-02-01 23:57:04,965 : INFO : gram7-past-tense: 67.1% (995/1482)
2014-02-02 00:15:18,525 : INFO : gram8-plural: 89.6% (889/992)
2014-02-02 00:28:18,140 : INFO : gram9-plural-verbs: 68.7% (482/702)
2014-02-02 00:28:18,140 : INFO : total: 74.3% (5654/7614)
>>> model.save('/tmp/mymodel')
>>> new_model = gensim.models.Word2Vec.load('/tmp/mymodel')
>>> model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)
>>> # using gzipped/bz2 input works too, no need to unzip:
>>> model = Word2Vec.load_word2vec_format('/tmp/vectors.bin.gz', binary=True)
>>> model = gensim.models.Word2Vec.load('/tmp/mymodel')
>>> model.train(more_sentences)
>>> model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
[('queen', 0.50882536)]
>>> model.doesnt_match("breakfast cereal dinner lunch".split())
'cereal'
>>> model.similarity('woman', 'man')
0.73723527
>>> model['computer'] # raw NumPy vector of a word
array([-0.00449447, -0.00310097,  0.02421786, ...], dtype=float32)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。