#title Word2Vec
[[TableOfContents]]

==== 예제 ====
{{{
# -*- coding: utf-8 -*-
"""
Created on Thu Feb  7 23:20:11 2019

@author: jhlee
"""

from gensim.models import Word2Vec
import pyodbc
import sys

# Load the most frequent chat messages for one hour of one day and tokenize
# them on single spaces, producing the list-of-token-lists that Word2Vec
# consumes as its training corpus.
conn = pyodbc.connect(driver='{SQL Server}', host='192.168.0.1', database='gamelog', user='id', password='pw')

sql = """
    select top 10000
        msg
    ,   count(*) cnt
    from dbo.sentences
    where 1=1
    and date_key = '20190207'
    and hh = 14
    group by
        msg
    order by cnt desc
"""
# NOTE: to make the date configurable, prefer a "?" placeholder together with
# cursor.execute(sql, date_key) over splicing sys.argv[1] into the SQL text.

sentences = []
try:
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        # Each row is (msg, cnt); only msg is used. Rows are distinct messages
        # because of the GROUP BY, so cnt does not weight the corpus.
        # A NULL msg has nothing to tokenize and would crash on .split().
        sentences = [
            row[0].split(" ")
            for row in cursor.fetchall()
            if row[0] is not None
        ]
    finally:
        cursor.close()
finally:
    # Explicit close: a pyodbc connection is not closed by a `with` block,
    # so try/finally is used to release it even when the query fails.
    conn.close()

# Example of one tokenized sentence, i.e. the output of print(sentences[0]):
#print(sentences[0])
#["''", '아', '병신']

# Train word vectors on the tokenized sentences.
#   size=100       -> dimensionality of each word vector
#   batch_words=10 -> target number of words per batch handed to the workers
#   min_count=50   -> words seen fewer than 50 times are dropped from the vocabulary
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in 4.0) - confirm
# the installed version before changing it.
model = Word2Vec(sentences, size=100, batch_words=10, min_count=50)
# Precompute L2-normalized vectors; replace=True discards the raw vectors to
# save memory, so the model can be queried but no longer trained further.
model.init_sims(replace=True)
# List the words nearest to the query word. The query word must have survived
# the min_count cut-off, otherwise the lookup raises KeyError.
model.wv.most_similar("병신")
}}}

==== 모델 저장/불러오기 ====
{{{
# Save / load the model.
# save() writes the trained model to the given path (relative to the current
# working directory); Word2Vec.load() reads it back into a new model object.
model.save("gold_dealer_model")
model = Word2Vec.load("gold_dealer_model")
}}}

==== 보캐블러리(vocabulary)에 있는 단어만 남기기 ====
{{{
# Tokenize each sentence string on single spaces and keep only the tokens
# that exist in the trained model's vocabulary. s1 and s2 must already be
# bound to sentence strings; afterwards they are lists of tokens.
s1 = [token for token in s1.split(" ") if token in model.wv.vocab]
s2 = [token for token in s2.split(" ") if token in model.wv.vocab]
}}}

==== 두 문장의 비교 ====
{{{
# Word Mover's Distance between two sentences (smaller means more similar).
s1 = 'the first sentence'
s2 = 'the second text'

# wmdistance() expects each document as a list of word tokens. Passing the
# raw strings would make it iterate over single characters and return a
# character-level distance, so tokenize both sentences first.
model.wv.wmdistance(s1.split(" "), s2.split(" "))
}}}


==== 참고자료 ====
 * https://datascienceschool.net/view-notebook/6927b0906f884a67b0da9310d3a581ee/
 * http://blog.theeluwin.kr/post/146591096133/%ED%95%9C%EA%B5%AD%EC%96%B4-word2vec
 * https://stackoverflow.com/questions/22129943/how-to-calculate-the-sentence-similarity-using-word2vec-model-of-gensim-with-pyt/22130100