import csv
import nltk
import gensim
from tqdm import tqdm
import pandas as pd
import numpy as np
import math
import matplotlib
import matplotlib.pylab as plt
import os, time
from frovedis.decomposition import LatentDirichletAllocation as frovLatentDirichletAllocation 
from frovedis.exrpc.server import FrovedisServer
import warnings

def fxn():
    """Emit a ``DeprecationWarning`` carrying the message "deprecated"."""
    warnings.warn("deprecated", category=DeprecationWarning)

# Call fxn() while every warning is filtered out. catch_warnings() restores the
# previous filter state when the block exits, so only the warning raised by
# fxn() is silenced; warnings raised later in the script are unaffected.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    fxn()

/home/user01/.local/lib/python3.6/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)


                                                                # Load the dataset and turn each row's 'bow' string into a list of tokens.
data = pd.read_table('../../data/datasets/data-8.csv', sep=',', engine='python')

# Each 'bow' cell is stripped of its surrounding braces and split on commas;
# any double quotes around a token are left in place at this stage.
bow = [cell.strip('{}').split(',') for cell in data['bow']]

print("data set size: ", len(bow))
print(bow[1])

data set size:  43090
['"u.s. treasury secretary"', '"janet yellen"', 'express', 'concern', 'house', 'affordability', 'thursday', 'expect', 'several', 'month', 'high', 'inflation', 'read', 'remain', 'transitory', 'add', 'fed', 'good', 'job', 'chair', '"jerome powell"', '"janet yellen"', 'interview', 'certain', 'amazon', 'meet', 'profitability', 'threshold', 'inclusion', 'new', 'global', 'reallocation', 'tax', 'right', 'agree', 'country', 'oecd', 'pillar', 'tax', 'deal', 'allow', 'country', 'large', 'highly', 'profitable', 'multinationals', 'sell', 'goods', 'service', 'tax', 'portion', 'profit', 'arrangement', 'replace', 'national', 'digital', 'service', 'tax', 'company', 'amazon', 'google', 'facebook', '"janet yellen"', 'run-up', 'u.s.', 'house', 'price', 'low', '"interest rate"', 'strong', 'demand', 'carry', 'risk', 'associate', 'house', 'crash', 'lend', 'creditworthy', 'borrower', 'different', 'phenomenon', 'worry', 'affordability', 'pressure', 'higher', 'house', 'price', 'create', 'family', 'first', 'time', 'homebuyers', '"janet yellen"', 'add', 'congress', 'consider', 'plan', '"joe biden"', 'administration', 'boost', 'supply', 'affordable', 'house', 'regard', 'elevate', 'inflation', 'data', 'view', 'transitory', 'think', 'several', 'month', 'rapid', 'inflation', 'one-month', 'phenomenon', 'think', 'medium', 'term', 'inflation', 'decline', 'back', 'normal', 'level', 'course', 'keep', 'careful', 'eye', 'regard', 'performance', 'fed', '"jerome powell"', 'lot', 'respect', 'fed', 'important', 'independent', 'judgment', 'appropriate', 'think', 'fed', 'good', 'job']


                                                                from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Rebuild every document as one space-separated string, dropping the double
# quotes that wrap some tokens. Note that this rebinds `data` from the loaded
# table to a list of strings.
data = [' '.join(token.strip('"') for token in doc) for doc in bow]
print(data[1])

# Count-based features; min_df / max_df bound the document frequency of the
# terms that are kept in the vocabulary.
vectorizer = CountVectorizer(min_df=0.05, max_df=0.8)
vec = vectorizer.fit(data)
X = vec.transform(data)

u.s. treasury secretary janet yellen express concern house affordability thursday expect several month high inflation read remain transitory add fed good job chair jerome powell janet yellen interview certain amazon meet profitability threshold inclusion new global reallocation tax right agree country oecd pillar tax deal allow country large highly profitable multinationals sell goods service tax portion profit arrangement replace national digital service tax company amazon google facebook janet yellen run-up u.s. house price low interest rate strong demand carry risk associate house crash lend creditworthy borrower different phenomenon worry affordability pressure higher house price create family first time homebuyers janet yellen add congress consider plan joe biden administration boost supply affordable house regard elevate inflation data view transitory think several month rapid inflation one-month phenomenon think medium term inflation decline back normal level course keep careful eye regard performance fed jerome powell lot respect fed important independent judgment appropriate think fed good job


                                                                # Sweep the number of topics with Frovedis LDA and plot score against perplexity.
start, limit, step = 1, 30, 1

# Start the Frovedis server through mpirun with 8 processes; the server binary
# path is read from the FROVEDIS_SERVER environment variable.
server_command = "mpirun -np 8 {}".format(os.environ['FROVEDIS_SERVER'])
FrovedisServer.initialize(server_command)

score_vals = []
perplexity_vals = []

# The same range drives both the training loop and the x axis of the plot.
x = range(start, limit, step)
for n_topic in tqdm(x):
    frov_lda = frovLatentDirichletAllocation(n_components=n_topic, max_iter=50, random_state=0)
    frov_lda.fit_transform(X)
    perplexity_vals.append(frov_lda.perplexity(X))
    score_vals.append(frov_lda.score(X))

fig, ax1 = plt.subplots(figsize=(12, 5))

# Left axis: model score for each topic count.
c1 = 'darkturquoise'
ax1.plot(x, score_vals, 'o-', color=c1)
ax1.set_xlabel('Num Topics')
ax1.set_ylabel('Score', color=c1)
ax1.tick_params('y', colors=c1)

# Right axis (sharing the x axis): perplexity for each topic count.
c2 = 'slategray'
ax2 = ax1.twinx()
ax2.plot(x, perplexity_vals, 'o-', color=c2)
ax2.set_ylabel('Perplexity', color=c2)
ax2.tick_params('y', colors=c2)

# One tick per evaluated topic count, then render.
ax1.set_xticks(x)
fig.tight_layout()
plt.show()

100%|██████████| 29/29 [10:14<00:00, 21.18s/it]


                                                                # Train the final Frovedis LDA model, time it, and print the top words per topic.
num_topics = 15
frov_lda = frovLatentDirichletAllocation(n_components=num_topics, max_iter=100)
t1 = time.time()
frov_lda.fit(X)
t2 = time.time()
print("train time: {:.3f} sec".format(t2 - t1))

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Use get_feature_names_out() when the installed version has
# it, and fall back to the old method on older installations.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# For every topic (row), vocabulary indices ordered from the largest component
# weight to the smallest. Deliberately not named `sorted`, which would shadow
# the builtin for the rest of the session.
num_words = 10
topic_word_order = np.argsort(frov_lda.components_, axis=1)[:, ::-1]
sorted_head = topic_word_order[:, :num_words]
for i in range(num_topics):
    # str() keeps the printed list identical whether the names come back as
    # plain Python strings or as NumPy string scalars.
    to_print = [str(feature_names[j]) for j in sorted_head[i]]
    print("topic {0}: {1}".format(i, to_print))

# Training on the Frovedis side is finished; stop the server.
FrovedisServer.shut_down()

train time: 3.699 sec
topic 0: ['year', 'quarter', 'company', 'bank', 'share', 'analyst', 'earn', 'revenue', 'expect', 'sachs']
topic 1: ['covid', '19', 'economy', 'country', 'government', 'economic', 'pandemic', 'vaccine', 'year', 'recovery']
topic 2: ['year', 'month', 'growth', 'quarter', 'economy', 'rise', 'data', 'show', 'increase', 'last']
topic 3: ['stock', 'share', '500', 'rise', 'gain', 'nasdaq', 'index', 'market', 'investor', 'high']
topic 4: ['market', 'stock', 'investor', 'trade', 'year', 'think', 'time', '500', 'high', 'look']
topic 5: ['donald', 'trump', 'joe', 'president', 'biden', 'tax', 'house', 'election', 'state', 'congress']
topic 6: ['dollar', 'market', 'currency', 'year', 'price', 'month', 'rise', 'investor', 'week', 'bank']
topic 7: ['china', 'trade', 'tariff', 'donald', 'trump', 'deal', 'president', 'talk', 'country', 'conflict']
topic 8: ['china', 'trade', 'index', 'dollar', 'close', 'stock', 'japan', 'market', 'rise', 'gain']
topic 9: ['fed', 'rate', 'policy', 'inflation', 'bank', 'interest', 'market', 'economy', 'central', 'cut']
topic 10: ['yield', 'year', 'bond', 'treasury', '10', 'point', 'market', 'rise', 'basis', 'investor']
topic 11: ['job', 'week', 'unemployment', 'month', 'report', 'covid', '19', 'home', 'worker', 'data']
topic 12: ['oil', 'price', 'gold', 'barrel', 'market', 'crude', 'week', 'future', 'demand', 'rise']
topic 13: ['bank', 'financial', 'loan', 'debt', 'fund', 'government', 'company', 'credit', 'finance', 'new']
topic 14: ['company', 'share', 'year', 'investor', 'fund', 'firm', 'investment', 'deal', 'market', 'include']


                                                                from sklearn.decomposition import LatentDirichletAllocation as skLatentDirichletAllocation
# Train scikit-learn's LDA on the same matrix for a timing comparison and
# print the top words of every topic.
num_topics = 15
sk_lda = skLatentDirichletAllocation(n_components=num_topics, learning_method='online')
t1 = time.time()
sk_lda.fit(X)
t2 = time.time()
print("train time: {:.3f} sec".format(t2 - t1))

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Use get_feature_names_out() when the installed version has
# it, and fall back to the old method on older installations.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# For every topic (row), vocabulary indices ordered from the largest component
# weight to the smallest. Deliberately not named `sorted`, which would shadow
# the builtin for the rest of the session.
num_words = 10
topic_word_order = np.argsort(sk_lda.components_, axis=1)[:, ::-1]
sorted_head = topic_word_order[:, :num_words]
for i in range(num_topics):
    # str() keeps the printed list identical whether the names come back as
    # plain Python strings or as NumPy string scalars.
    to_print = [str(feature_names[j]) for j in sorted_head[i]]
    print("topic {0}: {1}".format(i, to_print))

train time: 280.096 sec
topic 0: ['month', 'year', 'data', 'rise', 'economist', 'show', 'economy', 'growth', 'increase', 'last']
topic 1: ['china', 'trade', 'global', 'country', 'world', 'economy', 'beijing', 'import', 'year', 'hong']
topic 2: ['share', 'company', 'stock', 'deal', 'trade', 'sell', 'price', 'investor', 'sale', 'corp']
topic 3: ['donald', 'trump', 'tariff', 'president', 'deal', 'trade', 'talk', 'conflict', 'house', 'administration']
topic 4: ['company', 'fund', 'year', 'investment', 'firm', 'business', 'new', 'include', 'investor', 'capital']
topic 5: ['covid', '19', 'euro', 'european', 'germany', 'vaccine', 'europe', 'recovery', 'economic', 'economy']
topic 6: ['dollar', 'index', 'gold', 'currency', 'trade', 'rise', 'japan', 'market', 'gain', 'yen']
topic 7: ['week', '500', 'stock', 'gain', 'point', 'dow', 'index', 'rise', 'nasdaq', 'new']
topic 8: ['oil', 'price', 'barrel', 'crude', 'energy', 'demand', 'supply', 'production', 'cut', 'market']
topic 9: ['market', 'stock', 'investor', 'trade', 'year', 'think', 'time', 'high', 'price', 'look']
topic 10: ['government', 'covid', '19', 'economy', 'state', 'people', 'country', 'economic', 'pandemic', 'minister']
topic 11: ['yield', 'year', 'bond', 'treasury', '10', 'point', 'basis', 'market', 'rise', 'ecb']
topic 12: ['quarter', 'year', 'earn', 'analyst', 'company', 'growth', 'share', 'expect', 'profit', 'revenue']
topic 13: ['fed', 'rate', 'policy', 'inflation', 'interest', 'economy', 'bank', 'cut', 'central', 'market']
topic 14: ['bank', 'financial', 'loan', 'debt', 'credit', 'fund', 'central', 'market', 'finance', 'government']


                                                                # Build the gensim dictionary and bag-of-words corpus from the token lists.
# Tokens shorter than two characters are discarded first.
bow = [[token for token in doc if len(token) > 1] for doc in bow]
dictionary = gensim.corpora.Dictionary(bow)
# NOTE(review): no_below=2154 is presumably 5% of the document count, chosen to
# mirror the min_df=0.05 used for CountVectorizer - confirm against the dataset.
dictionary.filter_extremes(no_below=2154, no_above=0.8)
corpus = [dictionary.doc2bow(doc) for doc in bow]


                                                                # Sweep the number of topics with gensim LDA and plot coherence against perplexity.
start, limit, step = 1, 20, 1

coherence_vals = []
perplexity_vals = []

# The same range drives both the training loop and the x axis of the plot.
x = range(start, limit, step)
for n_topic in tqdm(x):
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=n_topic,
        random_state=0,
    )
    # NOTE(review): gensim's documentation describes perplexity as 2^(-bound),
    # whereas e^(-bound) is used here - confirm which base is intended.
    per_word_bound = lda_model.log_perplexity(corpus)
    perplexity_vals.append(np.exp(-per_word_bound))
    coherence_model_lda = gensim.models.CoherenceModel(
        model=lda_model,
        texts=bow,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_vals.append(coherence_model_lda.get_coherence())

fig, ax1 = plt.subplots(figsize=(12, 5))

# Left axis: c_v coherence for each topic count.
c1 = 'darkturquoise'
ax1.plot(x, coherence_vals, 'o-', color=c1)
ax1.set_xlabel('Num Topics')
ax1.set_ylabel('Coherence', color=c1)
ax1.tick_params('y', colors=c1)

# Right axis (sharing the x axis): perplexity for each topic count.
c2 = 'slategray'
ax2 = ax1.twinx()
ax2.plot(x, perplexity_vals, 'o-', color=c2)
ax2.set_ylabel('Perplexity', color=c2)
ax2.tick_params('y', colors=c2)

# One tick per evaluated topic count, then render.
ax1.set_xticks(x)
fig.tight_layout()
plt.show()

100%|██████████| 19/19 [40:28<00:00, 127.81s/it]


                                                                # Train the final gensim LDA model, time it, and print each topic's top 25 terms.
num_topics = 15

# The model is trained inside the LdaModel constructor when a corpus is passed,
# so timing the constructor call measures the training time.
t1 = time.time()
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=0,
)
t2 = time.time()
print("train time: {:.3f} sec".format(t2 - t1))

for topic_id in range(lda_model.num_topics):
    # Mapping of term -> weight for the 25 highest-weighted terms of the topic.
    top_terms = dict(lda_model.show_topic(topic_id, 25))
    print(topic_id, top_terms)
    print("---------------------------------------------------\n")

train time: 39.739 sec
0 {'market': 0.053761404, 'investor': 0.020111788, 'think': 0.019904515, 'stock': 0.01399406, 'time': 0.013099753, 'trade': 0.012302028, '"stock market"': 0.010837575, 'volatility': 0.010285967, 'year': 0.009513458, 'week': 0.0090554925, 'move': 0.008801158, 'price': 0.008786209, 'risk': 0.0080689965, 'look': 0.008063251, 'get': 0.007882137, 'bull': 0.0077432073, 'back': 0.007491272, 'strategist': 0.007457439, 'high': 0.00731959, 'bear': 0.0070969174, 'thing': 0.007023131, 'buy': 0.006838382, 'lot': 0.0067028683, 'day': 0.006606779, 'global': 0.0065658214}
---------------------------------------------------

1 {'bank': 0.0332958, 'u.k.': 0.023078755, 'financial': 0.021158777, 'trade': 0.021098945, 'rule': 0.013586895, 'system': 0.012128644, 'use': 0.011286877, 'market': 0.010891605, 'e.u.': 0.010865556, 'exchange': 0.010202443, 'new': 0.008975153, 'service': 0.008384848, 'industry': 0.0081956, 'regulator': 0.008161285, 'firm': 0.008157888, 'account': 0.007937537, 'customer': 0.00792115, 'risk': 0.0077864947, 'company': 0.007593266, 'change': 0.0075272596, 'currency': 0.0073114745, 'year': 0.0072970986, 'london': 0.006672484, 'foreign': 0.0066102888, 'need': 0.006431516}
---------------------------------------------------

2 {'bank': 0.08479011, 'debt': 0.028715108, 'loan': 0.02755938, 'central': 0.017022781, 'ecb': 0.014515732, 'year': 0.014286342, 'credit': 0.013988381, 'lend': 0.0132767055, 'government': 0.012527516, 'economy': 0.011464633, 'crisis': 0.009694113, 'business': 0.009334032, 'borrow': 0.009135348, 'country': 0.008785087, 'cut': 0.008076081, 'lender': 0.007915095, 'liquidity': 0.0077396547, 'finance': 0.007636113, 'need': 0.0074988, 'financial': 0.007477624, 'risk': 0.0074698636, '"interest rate"': 0.0074397526, 'economic': 0.007392, 'cost': 0.006986319, 'program': 0.006041723}
---------------------------------------------------

3 {'job': 0.04852475, 'worker': 0.025104843, 'week': 0.022154752, 'state': 0.017587196, 'claim': 0.016675938, 'business': 0.016125187, 'people': 0.014973336, 'u.s.': 0.0144548025, 'american': 0.014396983, 'report': 0.012720018, 'work': 0.0121321175, 'number': 0.011850902, 'employment': 0.010923327, 'government': 0.010683176, 'economy': 0.010616091, '"job market"': 0.009408729, 'unemployment': 0.009386783, 'economist': 0.008700606, 'federal': 0.008549031, 'program': 0.008312528, 'accord': 0.008152132, 'pay': 0.008054837, 'last': 0.008007951, 'economic': 0.007964718, 'covid-19': 0.007950731}
---------------------------------------------------

4 {'u.k.': 0.055528723, 'covid-19': 0.035566095, 'pound': 0.02540956, 'case': 0.020288635, 'virus': 0.020175325, 'report': 0.017384216, 'week': 0.014090572, 'sale': 0.013920055, 'health': 0.013569776, 'people': 0.013493512, 'new': 0.012796314, 'travel': 0.012749156, 'country': 0.012481698, 'number': 0.012151906, 'data': 0.0112340525, 'spread': 0.009662341, 'vaccine': 0.008851544, 'test': 0.0087554, 'year': 0.00827715, 'home': 0.008143301, 'month': 0.008032258, 'first': 0.0074710064, 'day': 0.0074119344, 'rise': 0.0072232154, 'last': 0.007214268}
---------------------------------------------------

5 {'oil': 0.052232277, 'price': 0.021708405, 'global': 0.019859368, 'barrel': 0.019796964, 'economy': 0.019015564, 'year': 0.017599734, 'cut': 0.017313488, 'crude': 0.016382433, 'u.s.': 0.014933228, 'demand': 0.014819045, 'production': 0.014777129, 'growth': 0.0141369905, 'economic': 0.013648353, 'output': 0.0127975745, 'supply': 0.012796155, 'energy': 0.012386941, 'expect': 0.010769808, 'world': 0.0097643705, 'wti': 0.008666918, 'country': 0.0086294655, 'covid-19': 0.008356266, 'producer': 0.008294179, 'forecast': 0.008262386, 'analyst': 0.008076413, 'month': 0.0073839445}
---------------------------------------------------

6 {'month': 0.02874208, 'rise': 0.02130247, 'growth': 0.019536171, 'year': 0.019273285, 'increase': 0.018658765, 'data': 0.01783876, 'last': 0.01672811, 'price': 0.015983453, 'economist': 0.01538295, 'show': 0.014475185, 'report': 0.013654887, 'u.s.': 0.012602823, 'economy': 0.0122645525, 'consumer': 0.011563786, 'sale': 0.011108179, 'drop': 0.009647607, 'fell': 0.009629652, 'trade': 0.009242939, 'spend': 0.009086209, 'decline': 0.008757632, 'manufacture': 0.008548958, 'expect': 0.008443767, 'survey': 0.008248704, 'rate': 0.008226723, 'slow': 0.007612085}
---------------------------------------------------

7 {'"donald trump"': 0.059938867, 'president': 0.029495655, 'u.s.': 0.024258228, 'e.u.': 0.01859407, 'deal': 0.017295266, 'tax': 0.015500539, 'government': 0.014804236, 'country': 0.012671713, 'talk': 0.012271955, 'meet': 0.012095674, 'official': 0.010259799, 'tell': 0.010206899, 'administration': 0.0097215045, 'leader': 0.009295711, '"white house"': 0.008843321, 'economy': 0.008679082, 'plan': 0.008563641, 'economic': 0.008451415, 'agreement': 0.008403269, 'election': 0.0074886214, 'vote': 0.006812037, 'state': 0.0066422382, 'new': 0.0066113505, 'negotiation': 0.006344399, 'political': 0.006169018}
---------------------------------------------------

8 {'company': 0.04241937, 'fund': 0.03333932, 'year': 0.02343534, 'investor': 0.017284289, 'investment': 0.01538896, 'firm': 0.012540421, 'capital': 0.011269929, 'plan': 0.011021798, 'business': 0.010582689, 'accord': 0.01038936, 'last': 0.00978364, 'asset': 0.009516143, 'include': 0.009022967, 'executive': 0.008868594, 'deal': 0.008516654, 'manager': 0.008320341, 'bank': 0.008254319, 'share': 0.008049017, 'invest': 0.007995186, 'new': 0.0072774114, 'cash': 0.0070419107, 'group': 0.00701638, 'ceo': 0.00700605, 'source': 0.0068199094, 'sell': 0.0068164147}
---------------------------------------------------

9 {'fed': 0.08217692, 'rate': 0.023392234, 'economy': 0.02287308, 'inflation': 0.02047236, '"interest rate"': 0.019424612, 'year': 0.017932735, 'policy': 0.017657619, 'cut': 0.017183615, 'rates': 0.016024332, '"jerome powell"': 0.015602326, 'economic': 0.014687915, 'bank': 0.01433782, 'central': 0.013705585, 'growth': 0.0136747025, 'hike': 0.011526007, 'meet': 0.01059949, 'expect': 0.009963159, 'u.s.': 0.009198383, 'time': 0.008132618, 'recession': 0.007517827, '"monetary policy"': 0.007484537, 'raise': 0.007352254, 'target': 0.0067574712, 'president': 0.0065400773, 'market': 0.0064726677}
---------------------------------------------------

10 {'trade': 0.026421167, 'u.s.': 0.024788182, 'rise': 0.023491828, 'gain': 0.023439627, '"s&p 500"': 0.020100253, 'stock': 0.017806021, 'market': 0.017091433, 'week': 0.01690931, 'close': 0.01665139, 'day': 0.016186371, 'fell': 0.016129421, 'index': 0.015952837, 'point': 0.014845879, 'dow': 0.014001997, 'high': 0.013715479, 'nasdaq': 0.013211274, 'investor': 0.011494785, 'future': 0.011173804, 'share': 0.0105759725, 'dollar': 0.010406377, 'decline': 0.010281896, 'drop': 0.01016758, 'session': 0.009712288, 'low': 0.008427251, 'higher': 0.008086364}
---------------------------------------------------

11 {'stock': 0.03148051, 'year': 0.031240862, 'earn': 0.02819766, 'company': 0.021288471, '"s&p 500"': 0.019016093, 'market': 0.015426341, 'trade': 0.013133068, 'growth': 0.012483887, 'analyst': 0.012130098, 'investor': 0.011204688, 'report': 0.00991744, 'share': 0.00926051, 'expect': 0.009029505, 'sector': 0.008938341, 'high': 0.008773781, 'apple': 0.008438152, 'profit': 0.008403938, 'revenue': 0.008202103, 'estimate': 0.007703336, 'equity': 0.007397624, 'tech': 0.0073719933, 'time': 0.007289203, 'accord': 0.007282901, 'look': 0.0071443403, 'higher': 0.007142101}
---------------------------------------------------

12 {'share': 0.064796925, 'stock': 0.043202065, 'company': 0.02734056, 'trade': 0.02003781, 'china': 0.01987294, 'index': 0.017672619, 'investor': 0.01621393, 'corp': 0.0149780465, 'close': 0.014560375, 'sector': 0.012724906, 'maker': 0.012511863, 'rise': 0.0111916, 'market': 0.011030178, 'co': 0.010635624, '"hong kong"': 0.0102905, 'end': 0.010049839, 'report': 0.009732282, 'price': 0.0095677115, 'profit': 0.009101334, 'fell': 0.00892748, 'monday': 0.00875902, 'u.s.': 0.008522433, 'firm': 0.008231075, 'u.s.-china': 0.0071269507, 'gain': 0.0069721052}
---------------------------------------------------

13 {'yield': 0.040636495, 'dollar': 0.032767124, 'u.s.': 0.0231729, 'gold': 0.019627916, 'rise': 0.019313881, 'bond': 0.017387373, 'currency': 0.016328096, 'investor': 0.015806189, 'point': 0.015098576, 'market': 0.014023234, 'week': 0.01292649, 'trade': 0.011892651, 'basis': 0.011116506, '10-year': 0.010847266, 'euro': 0.010369865, 'fell': 0.0098264795, 'fed': 0.009556415, 'price': 0.00924608, 'global': 0.0089106625, '"interest rate"': 0.008731655, 'low': 0.008696686, 'last': 0.008131036, '"10-year treasury note"': 0.00763029, 'high': 0.007276938, 'move': 0.0069675897}
---------------------------------------------------

14 {'china': 0.09241967, 'u.s.': 0.056214854, 'tariff': 0.04330696, 'trade': 0.03524462, '"trade conflict"': 0.01735591, '"donald trump"': 0.017189667, 'deal': 0.015623232, 'u.s.-china': 0.015016183, 'company': 0.013147883, 'goods': 0.011886283, 'beijing': 0.011596753, 'import': 0.011468087, 'year': 0.010811381, 'global': 0.008086868, 'president': 0.007947163, 'talk': 0.0077083944, 'two': 0.007644566, 'world': 0.007400059, 'country': 0.0072567044, 'include': 0.0069692307, 'product': 0.0065049967, 'last': 0.006383196, 'impose': 0.006257779, 'washington': 0.006182525, 'report': 0.005856462}
---------------------------------------------------

学習アルゴリズム	学習時間 (秒)	Frovedisでの高速化
Frovedis LDA	3.70	-
Scikit-learn LDA (CPU)	280.1	75.7倍
Gensim LDA (CPU)	39.74	10.7倍

BluStellar（ブルーステラ）

製品・ソリューション

業種・業務

企業情報

サイト内の現在位置

FrovedisのLDAで文書分類のための学習高速化

教師なし学習：　¶

LDA（scikit-learn版、Frovedis版での学習時間比較, 参考としてgensim版による学習）¶

使用するデータセット：　単語分割とStop words除去を実施した経済関連ニュース記事¶

データロード¶

データ加工と出現する単語カウントの特徴量化¶

ScoreとPerplexityからトピック数を決定¶

Frovedis LDAによる学習¶

scikit-learn LDAによる学習¶

gensim LDAによる学習¶

gensimでトピック数を決定¶

関連リンク

サイト内の現在位置

FrovedisのLDAで文書分類のための学習高速化

教師なし学習： ¶

LDA（scikit-learn版、Frovedis版での学習時間比較, 参考としてgensim版による学習）¶

使用するデータセット： 単語分割とStop words除去を実施した経済関連ニュース記事¶

データロード¶

データ加工と出現する単語カウントの特徴量化¶

ScoreとPerplexityからトピック数を決定¶

Frovedis LDAによる学習¶

scikit-learn LDAによる学習¶

gensim LDAによる学習¶

gensimでトピック数を決定¶

関連リンク

教師なし学習：　¶

使用するデータセット：　単語分割とStop words除去を実施した経済関連ニュース記事¶