#title Word2Vec
[[TableOfContents]]

==== 예제 ====
{{{
# -*- coding: utf-8 -*-
"""
Train a Word2Vec model on the most frequent messages in dbo.sentences.

Created on Thu Feb 7 23:20:11 2019

@author: jhlee
"""

from gensim.models import Word2Vec
import pyodbc
import sys

# The 10,000 most frequent distinct messages for one date/hour.
# Only the msg column (row[0]) is used below; cnt just drives the ordering.
sql = """
select top 10000
    msg
,   count(*) cnt
from dbo.sentences
where 1=1
and date_key = '20190207'
and hh = 14
group by msg
order by cnt desc
"""

# To take the date from the command line, prefer a bound parameter over
# string replacement: write "date_key = ?" in the query and call
# cursor.execute(sql, sys.argv[1]).
#sql = sql.replace("@date_key", sys.argv[1])

conn = pyodbc.connect(driver='{SQL Server}', host='192.168.0.1', database='gamelog', user='id', password='pw')
try:
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        # One token list per message, split on single spaces.
        sentences = [row[0].split(" ") for row in cursor.fetchall()]
    finally:
        # Release the cursor even if the query fails.
        cursor.close()
finally:
    # The connection is no longer needed once the rows are in memory.
    conn.close()

#print(sentences[0])
#["''", '아', '병신']

# NOTE: written for gensim 3.x. gensim 4 renamed size -> vector_size and
# wv.vocab -> wv.key_to_index, and deprecated init_sims().
# min_count=50 drops words that occur fewer than 50 times in `sentences`.
model = Word2Vec(sentences, size=100, batch_words=10, min_count=50)
# Replace the raw vectors with their L2-normalized form to save memory;
# the model cannot be trained further after this call.
model.init_sims(replace=True)
model.wv.most_similar("병신")
}}}

==== 모델 저장/불러오기 ====
{{{
# Save / load the model
model.save("gold_dealer_model")
model = Word2Vec.load("gold_dealer_model")
}}}

==== 보캐뷸러리(vocabulary)에 있는 것만... ====
{{{
# s1 and s2 are sentence strings; keep only the words the model knows.
# The results are token lists, which is what wmdistance() expects.
s1 = [word for word in s1.split(" ") if word in model.wv.vocab]
s2 = [word for word in s2.split(" ") if word in model.wv.vocab]
}}}

==== 두 문장의 비교 ====
{{{
s1 = 'the first sentence'
s2 = 'the second text'
# wmdistance() takes lists of tokens; a raw string would be iterated
# character by character, so split the sentences into words first.
model.wv.wmdistance(s1.split(" "), s2.split(" "))
}}}

==== 참고자료 ====
 * https://datascienceschool.net/view-notebook/6927b0906f884a67b0da9310d3a581ee/
 * http://blog.theeluwin.kr/post/146591096133/%ED%95%9C%EA%B5%AD%EC%96%B4-word2vec
 * https://stackoverflow.com/questions/22129943/how-to-calculate-the-sentence-similarity-using-word2vec-model-of-gensim-with-pyt/22130100