forked from jubatus/jubatus.github.com
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtutorial.html
More file actions
728 lines (693 loc) · 45 KB
/
tutorial.html
File metadata and controls
728 lines (693 loc) · 45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>Overview and Scenario — Jubatus 0.1.0 documentation</title>
<link rel="stylesheet" href="_static/jubatus.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<script type="text/javascript">
// Guide text displayed inside the search field while the user has typed nothing.
var GuideSentence = 'keyword';

/**
 * Restores the guide text when the field is left empty.
 * An empty field gets the guide text in light grey; a field that already
 * holds text is left as-is and rendered in dark grey.
 *
 * @param {Object} obj  the input element (needs `value` and `style.color`).
 */
function ShowFormGuide(obj) {
  var isBlank = (obj.value == '');
  if (isBlank) {
    obj.value = GuideSentence;
  }
  obj.style.color = isBlank ? '#cccccc' : '#333333';
}

/**
 * Switches the field to its editing colour and removes the guide text,
 * if the guide text is what the field currently contains.
 *
 * @param {Object} obj  the input element (needs `value` and `style.color`).
 */
function HideFormGuide(obj) {
  obj.style.color = '#827046';
  if (obj.value != GuideSentence) {
    return; // real user input: keep it
  }
  obj.value = '';
}
</script>
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
URL_ROOT: '',
VERSION: '0.1.0',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true
};
</script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<link rel="shortcut icon" href="_static/favicon.ico"/>
<link rel="top" title="Jubatus 0.1.0 documentation" href="index.html" />
<link rel="up" title="Getting Started" href="gettingstarted.html" />
<link rel="next" title="References" href="references.html" />
<link rel="prev" title="Getting Started" href="gettingstarted.html" />
<!--
_________ ___
\____ _| | / ___I_I___
| | | | \__^ ^__/ ____
| | | | ___ ___ __ | | / __/
| | |-| |-|| |/ \ / \| | | | |^| |^|| (__
| | | | | || ^ || ^ | | | | | | | \__ \
| | | \_/ || O || O | | |_| \_/ | ___) |
| | \__/|_||_|\___/ \___/|_| |__/ \__/|_| \___/
| /
/ /
|/
-->
<script type="text/javascript">
// Google Analytics command queue: reuse an existing global _gaq if one is
// already defined, otherwise start with an empty array.
var _gaq = _gaq || [];
// Queue the account id and a page-view command; they are plain array
// entries until the ga.js script loaded below picks them up.
_gaq.push(['_setAccount', 'UA-26408953-1']);
_gaq.push(['_trackPageview']);
// Inject the ga.js loader without adding any names to the global scope.
(function() {
// Build an async <script> element for ga.js.
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
// Use the "ssl" host over HTTPS and the "www" host otherwise, matching the page's protocol.
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
// Insert the new element just before the first <script> in the document.
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
</head>
<body>
<div class="related">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
<a href="references.html" title="References"
accesskey="N">next</a> |</li>
<li class="right" >
<a href="gettingstarted.html" title="Getting Started"
accesskey="P">previous</a> |</li>
<li><a href="index.html">Jubatus 0.1.0 documentation</a> »</li>
<li><a href="gettingstarted.html" accesskey="U">Getting Started</a> »</li>
</ul>
</div>
<a href="index.html">
<div class="title"></div>
</a>
<div class="sphinxsidebar">
<div class="sphinxsidebarwrapper">
<h3><a href="index.html">Table Of Contents</a></h3>
<ul>
<li><a class="reference internal" href="#">Overview and Scenario</a><ul>
<li><a class="reference internal" href="#prequisites">Prerequisites</a></li>
</ul>
</li>
<li><a class="reference internal" href="#setup-a-single-process-jubatus-server">Setup a single process Jubatus Server</a><ul>
<li><a class="reference internal" href="#building-and-installing-jubatus">building and installing Jubatus</a></li>
<li><a class="reference internal" href="#installing-python-client-for-jubatus">installing Python client for Jubatus</a></li>
</ul>
</li>
<li><a class="reference internal" href="#sending-query-to-a-jubatus-server">Sending query to a Jubatus server</a><ul>
<li><a class="reference internal" href="#prepairing-dataset">Preparing dataset</a></li>
<li><a class="reference internal" href="#set-configure">Set configure</a></li>
<li><a class="reference internal" href="#train-classify">Train/Classify</a></li>
</ul>
</li>
<li><a class="reference internal" href="#setup-jubatus-server-with-multiple-processes">Setup Jubatus Server with multiple processes</a><ul>
<li><a class="reference internal" href="#setup-zookeeper">Setup ZooKeeper</a></li>
<li><a class="reference internal" href="#jubakeeper">jubakeeper</a></li>
<li><a class="reference internal" href="#running-two-processes-as-one-classifier-instance">Running two processes as one classifier instance</a></li>
</ul>
</li>
<li><a class="reference internal" href="#setup-jubatus-in-cluster">Setup Jubatus in cluster</a><ul>
<li><a class="reference internal" href="#jubavisor-process-management-with-zookeeper">Jubavisor(Process Management with zookeeper)</a></li>
<li><a class="reference internal" href="#client-for-multi-process-jubatus-server">Client for multi process Jubatus Server</a></li>
</ul>
</li>
</ul>
<h4>Previous topic</h4>
<p class="topless"><a href="gettingstarted.html"
title="previous chapter">Getting Started</a></p>
<h4>Next topic</h4>
<p class="topless"><a href="references.html"
title="next chapter">References</a></p>
<h3>This Page</h3>
<ul class="this-page-menu">
<li><a href="_sources/tutorial.txt"
rel="nofollow">Show Source</a></li>
</ul>
<div id="searchbox" style="display: none">
<h3>Quick search</h3>
<form class="search" action="search.html" method="get">
<input type="text" name="q" size="18" class="keyword" value="keyword" onFocus="HideFormGuide(this);" onBlur="ShowFormGuide(this);" />
<input type="submit" value="Go" class="searchBtn" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
<p class="searchtip" style="font-size: 90%">
Enter search terms or a module, class or function name.
</p>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
<script src="http://widgets.twimg.com/j/2/widget.js"></script>
<script>
// Build and start a Twitter profile widget for the 'JubatusOfficial' account.
// NOTE(review): TWTR is presumably defined by the widgets.twimg.com
// widget.js script included just before this block — confirm it is still served.
// The meaning of the individual options (rpp, interval, features, ...) is
// defined by that external script and is not visible in this file.
new TWTR.Widget({
version: 2,
type: 'profile',
rpp: 4,
interval: 30000,
// Widget dimensions.
width: 220,
height: 300,
// Colour scheme: dark shell, black tweet area, white text, green links.
theme: {
shell: {
background: '#333333',
color: '#ffffff'
},
tweets: {
background: '#000000',
color: '#ffffff',
links: '#4aed05'
}
},
features: {
scrollbar: false,
loop: false,
live: false,
behavior: 'all'
}
// Chain: render the widget, bind it to the account, then start it.
}).render().setUser('JubatusOfficial').start();
</script>
</div>
</div>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body">
<p>This tutorial introduces the basic usage of the Jubatus framework.</p>
<div class="section" id="overview-and-scenario">
<h1>Overview and Scenario<a class="headerlink" href="#overview-and-scenario" title="Permalink to this headline">¶</a></h1>
<p>This tutorial is made up of the following four sections:</p>
<ul class="simple">
<li>Setup a single process Jubatus Server<ul>
<li>install Jubatus</li>
</ul>
</li>
<li>How to use Jubatus<ul>
<li>Preparing dataset</li>
<li>Set configure</li>
<li>Train/Classify</li>
</ul>
</li>
<li>Setup multi process Jubatus Server<ul>
<li>Setup ZooKeeper</li>
<li>jubakeeper</li>
<li>run two process with local_mixture</li>
</ul>
</li>
<li>Setup Jubatus in cluster<ul>
<li>jubavisor(Process Management with zookeeper)</li>
<li>Processes Management with zookeeper</li>
<li>Client for multi process Jubatus Server</li>
</ul>
</li>
</ul>
<p>このチュートリアルでは、自然言語の分類に対する評価用データとして有名な、 <a class="reference external" href="http://people.csail.mit.edu/jrennie/20Newsgroups/">News20(20news-bydate.tar.gz)</a> を利用します。
News20では、話題が20個のnewsgroupに分かれており、人々は自分が適していると思ったnewsgroupに投稿します。
News20は便宜上、80%の学習用データ(20news-bydate-train)と、20%の実験用データ(20news-bydate-test)の二種類に分けられています。
このチュートリアルの目的は、学習用データを(投稿先newsgroup, 投稿内容)のセットとして学習し、テスト用データ(投稿内容)から、投稿先newsgroupを推測することです。</p>
<div class="section" id="prequisites">
<h2>Prerequisites<a class="headerlink" href="#prequisites" title="Permalink to this headline">¶</a></h2>
<p>This tutorial requires the following software to be installed:</p>
<ul class="simple">
<li>Linux 2.6 +</li>
<li>gcc 4.0 +</li>
<li>pkg-config</li>
<li>python 2.6+ and <a class="reference external" href="http://pypi.python.org/pypi/msgpack-python/">msgpack-python</a></li>
<li><a class="reference external" href="http://msgpack.org">libmsgpack</a></li>
<li><a class="reference external" href="http://github.com/pfi/pficommon">pficommon</a> - must be configured with msgpack enabled.</li>
<li><a class="reference external" href="http://code.google.com/p/re2/">re2</a></li>
<li><a class="reference external" href="http://code.google.com/p/google-glog/">google-glog</a></li>
<li><a class="reference external" href="http://code.google.com/p/ux-trie/">ux-trie</a> / <a class="reference external" href="http://mecab.sourceforge.net/">MeCab</a> (optional)</li>
<li><a class="reference external" href="http://zookeeper.apache.org/">ZooKeeper</a> server and C client (optional, for multiple processes)</li>
</ul>
</div>
</div>
<div class="section" id="setup-a-single-process-jubatus-server">
<h1>Setup a single process Jubatus Server<a class="headerlink" href="#setup-a-single-process-jubatus-server" title="Permalink to this headline">¶</a></h1>
<p>ここでは、JubatusをCentOS 5.2にインストールするための手順を示します。 他のdistributionを利用する場合は、適宜読み替えてください。</p>
<div class="section" id="building-and-installing-jubatus">
<h2>building and installing Jubatus<a class="headerlink" href="#building-and-installing-jubatus" title="Permalink to this headline">¶</a></h2>
<p>上記を事前にインストールした上で、Jubatusをbuild, installしてください。</p>
<div class="highlight-python"><pre>$ git clone git@github.com:jubatus/jubatus.git
$ cd jubatus
$ ./waf configure
$ ./waf build
# ./waf install
$ jubaclassifier --name tutorial</pre>
</div>
<p>これで、1プロセス構成でのセットアップと起動が完了しました。</p>
<p><tt class="docutils literal"><span class="pre">--name</span></tt> オプションは、複数のJubatusプロセスを利用する際に使います。
詳しくは、 <a class="reference internal" href="#multiprocess"><em>Setup Jubatus Server with multiple processes</em></a> を参照してください。</p>
<p>JubatusはデフォルトでMessagePack RPCサーバとして9199ポートで待ち受けます。
他のサービスなどが同じポートを利用している場合は、オプションを利用して、他のポートを利用してください。
例えば、RPCサーバを9181番で待ち受ける場合は、</p>
<div class="highlight-python"><pre>$ jubaclassifier --rpc-port=9181 --name=tutorial</pre>
</div>
<p>になります。</p>
</div>
<div class="section" id="installing-python-client-for-jubatus">
<h2>installing Python client for Jubatus<a class="headerlink" href="#installing-python-client-for-jubatus" title="Permalink to this headline">¶</a></h2>
<p>Python client of Jubatus requires <a class="reference external" href="http://pypi.python.org/pypi/msgpack-python/">msgpack-python</a>.</p>
<div class="highlight-python"><pre>$ git clone git@github.com:jubatus/jubatus-python.git
$ cd jubatus-python
$ python setup.py build
# python setup.py install</pre>
</div>
<p>Check your python installation by running tests:</p>
<div class="highlight-python"><pre>$ python setup.py test</pre>
</div>
</div>
</div>
<div class="section" id="sending-query-to-a-jubatus-server">
<h1>Sending query to a Jubatus server<a class="headerlink" href="#sending-query-to-a-jubatus-server" title="Permalink to this headline">¶</a></h1>
<div class="section" id="prepairing-dataset">
<h2>Preparing dataset<a class="headerlink" href="#prepairing-dataset" title="Permalink to this headline">¶</a></h2>
<p>20news-bydate.tar.gzを展開すると、</p>
<div class="highlight-python"><pre>-20news-bydate-train
alt.atheism
49960
51060
...
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc</pre>
</div>
<p>のファイル群が展開されます。数値49960がファイル名で、newsgroup名がalt.atheismになります。
例えば、20news-bydate-train/rec.motorcycles/104435の中身は、</p>
<div class="highlight-python"><pre>From: karr@cs.cornell.edu (David Karr)
Subject: Re: BMW MOA members read this!
Organization: Cornell Univ. CS Dept, Ithaca NY 14853
Lines: 19
In article &lt;C5Joz9.HLn@cup.hp.com&gt; Chris Steinbroner &lt;hesh@cup.hp.com&gt; writes:
>Wm. L. Ranck (ranck@joesbar.cc.vt.edu) wrote:
>: As a new BMW owner I was thinking about signing up for the MOA, but
>: right now it is beginning to look suspiciously like throwing money
>: down a rathole.
>
>[...] i'm going to
>let my current membership lapse when it's
>up for renewal.
>
>-- hesh
In my case that's not for another 3+ years, so I'd appreciate any
hints on what will keep the organization in business that long. (And
preferably longer, of course, and worth being part of.)
-- David Karr (karr@cs.cornell.edu)</pre>
</div>
<p>のようなテキストファイルです。
これらのテキストファイルを学習データとして利用します。</p>
</div>
<div class="section" id="set-configure">
<h2>Set configure<a class="headerlink" href="#set-configure" title="Permalink to this headline">¶</a></h2>
<p>jubaclassifierは、method, converterのオプションを外部からのqueryで指定することによって、動作を指定することが出来ます。オプションのプロトタイプは、以下のとおりです。</p>
<div class="highlight-python"><div class="highlight"><pre><span class="n">config</span> <span class="o">=</span> <span class="p">{</span>
<span class="s">'converter'</span><span class="p">:</span> <span class="p">{</span>
<span class="s">'string_filter_types'</span><span class="p">:</span> <span class="p">{},</span>
<span class="s">'string_filter_rules'</span><span class="p">:</span> <span class="p">[],</span>
<span class="s">'num_filter_types'</span><span class="p">:</span> <span class="p">{},</span>
<span class="s">'num_filter_rules'</span><span class="p">:</span> <span class="p">[],</span>
<span class="s">'string_types'</span><span class="p">:</span> <span class="p">{},</span>
<span class="s">'string_rules'</span><span class="p">:</span> <span class="p">[],</span>
<span class="s">'num_types'</span><span class="p">:</span> <span class="p">{},</span>
<span class="s">'num_rules'</span><span class="p">:</span> <span class="p">[]</span>
<span class="p">},</span>
<span class="s">'method'</span><span class="p">:</span> <span class="s">''</span>
<span class="p">}</span>
</pre></div>
</div>
<p><tt class="docutils literal"><span class="pre">'method'</span></tt> は、以下のアルゴリズムのうちいずれかを指定することが出来ます。</p>
<ul class="simple">
<li><tt class="docutils literal"><span class="pre">perceptron</span></tt></li>
<li><tt class="docutils literal"><span class="pre">PA</span></tt>, <tt class="docutils literal"><span class="pre">PA1</span></tt>, <tt class="docutils literal"><span class="pre">PA2</span></tt></li>
<li><tt class="docutils literal"><span class="pre">CW</span></tt></li>
<li><tt class="docutils literal"><span class="pre">AROW</span></tt></li>
<li><tt class="docutils literal"><span class="pre">NHERD</span></tt></li>
</ul>
<p>今回は、 <tt class="docutils literal"><span class="pre">PA</span></tt> を選択します。</p>
<p><tt class="docutils literal"><span class="pre">'converter'</span></tt> は、入力データをどのように加工して、特徴ベクトルに変換するのかを指定します。</p>
<p>今回は、自然言語のテキストです。
英語など多くの言語は、&lt;space&gt;, &lt;Return&gt;で単語に分割出来るので、単語化して特徴ベクトルにすることにしましょう。
また、HTMLタグなどは、内容を分類するのにノイズになりそうなので、”&lt;&gt;”で囲まれた部分を除去することにしましょう。</p>
<p>こういった自然言語処理、与えられた値の重み付けなど、様々なルール付けを行うことが出来ます。
今回のルールをPythonオブジェクトで表現すると、以下のようになります。</p>
<div class="highlight-python"><pre>config = {
'converter': {
"string_filter_types": {
"detag": { "method": "regexp", "pattern": "&lt;[^&gt;]*&gt;", "replace": "" }
},
"string_filter_rules":
[
{ "key": "message", "type": "detag", "suffix": "-detagged" }
],
'num_filter_types': {},
'num_filter_rules': [],
'string_types': {},
'string_rules': [
{'key': 'message-detagged', 'type': "space", "sample_weight": "bin", "global_weight": "bin"}
],
'num_types': {},
'num_rules': []
},
'method': 'PA'
}</pre>
</div>
<p><tt class="docutils literal"><span class="pre">get_config</span></tt> に対してRPC呼び出しを行うと、現在指定されているオプションが返ってきます。</p>
</div>
<div class="section" id="train-classify">
<h2>Train/Classify<a class="headerlink" href="#train-classify" title="Permalink to this headline">¶</a></h2>
<p>学習器に学習させる場合は、 <tt class="docutils literal"><span class="pre">train</span></tt> というAPIを利用します。</p>
<div class="highlight-python"><div class="highlight"><pre><span class="n">train_dat</span> <span class="o">=</span> <span class="p">[</span>
<span class="p">(</span>
<span class="s">"comp.windows.x"</span><span class="p">,</span>
<span class="p">[</span>
<span class="p">[</span><span class="s">"message"</span> <span class="p">,</span> <span class="s">"some messages about windows..."</span><span class="p">],</span>
<span class="p">[</span><span class="s">"from"</span> <span class="p">,</span> <span class="s">"hoge@n.tt"</span><span class="p">]</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="p">(</span>
<span class="s">"comp.sys.mac.hardware"</span><span class="p">,</span>
<span class="p">[</span><span class="s">"message"</span> <span class="p">,</span> <span class="s">"I want to buy a new mac book air..."</span><span class="p">,]</span>
<span class="p">)</span>
<span class="p">]</span>
</pre></div>
</div>
<p>推定させる場合は、 <tt class="docutils literal"><span class="pre">classify</span></tt> というAPIを利用します。</p>
<div class="highlight-python"><div class="highlight"><pre><span class="n">classify_dat</span> <span class="o">=</span> <span class="p">[[</span>
<span class="p">[</span><span class="s">"some messages about windows..."</span><span class="p">],</span>
<span class="p">[</span><span class="s">"I bought a new mac book air..."</span><span class="p">],</span>
<span class="p">]]</span>
</pre></div>
</div>
<p>その結果は、以下のような値が得られます。</p>
<div class="highlight-python"><div class="highlight"><pre><span class="p">[[</span>
<span class="p">[</span><span class="s">"alt.atheism"</span><span class="p">,</span> <span class="mf">1.10477745533</span><span class="p">],</span>
<span class="o">...</span>
<span class="p">[</span><span class="s">"rec.sport.hockey"</span><span class="p">,</span> <span class="mf">2.0973217487300002</span><span class="p">],</span>
<span class="p">[</span><span class="s">"comp.os.ms-windows.misc"</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.065333858132400002</span><span class="p">],</span>
<span class="p">[</span><span class="s">"sci.electronics"</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.184129983187</span><span class="p">],</span>
<span class="p">[</span><span class="s">"talk.religion.misc"</span><span class="p">,</span> <span class="o">-</span><span class="mf">0.092822007834899994</span><span class="p">]</span>
<span class="p">]]</span>
</pre></div>
</div>
<p>それぞれのラベルごとの確率が出るので、この中で一番大きい値を提示すればおそらくそれは正しい分類でしょう。
JubatusはMessagePack-RPCを利用できるあらゆる言語から利用することが出来ます。最後に、pythonのコードを示します。</p>
<div class="highlight-python"><div class="highlight"><pre><span class="c">#!/usr/bin/env python</span>
<span class="c"># -*- coding: utf-8 -*-</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">jubatus</span>
<span class="k">def</span> <span class="nf">parse_args</span><span class="p">():</span>
<span class="kn">from</span> <span class="nn">optparse</span> <span class="kn">import</span> <span class="n">OptionParser</span><span class="p">,</span> <span class="n">OptionValueError</span>
<span class="n">p</span> <span class="o">=</span> <span class="n">OptionParser</span><span class="p">()</span>
<span class="n">p</span><span class="o">.</span><span class="n">add_option</span><span class="p">(</span><span class="s">'-s'</span><span class="p">,</span> <span class="s">'--server_list'</span><span class="p">,</span> <span class="n">action</span><span class="o">=</span><span class="s">'store'</span><span class="p">,</span>
<span class="n">dest</span><span class="o">=</span><span class="s">'server_list'</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s">'string'</span><span class="p">,</span> <span class="n">default</span><span class="o">=</span><span class="s">'localhost:9199'</span><span class="p">)</span>
<span class="n">p</span><span class="o">.</span><span class="n">add_option</span><span class="p">(</span><span class="s">'-n'</span><span class="p">,</span> <span class="s">'--name'</span><span class="p">,</span> <span class="n">action</span><span class="o">=</span><span class="s">'store'</span><span class="p">,</span>
<span class="n">dest</span><span class="o">=</span><span class="s">'name'</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s">'string'</span><span class="p">,</span> <span class="n">default</span><span class="o">=</span><span class="s">'test'</span><span class="p">)</span>
<span class="n">p</span><span class="o">.</span><span class="n">add_option</span><span class="p">(</span><span class="s">'-a'</span><span class="p">,</span> <span class="s">'--algo'</span><span class="p">,</span> <span class="n">action</span><span class="o">=</span><span class="s">'store'</span><span class="p">,</span>
<span class="n">dest</span><span class="o">=</span><span class="s">'algo'</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s">'string'</span><span class="p">,</span> <span class="n">default</span><span class="o">=</span><span class="s">"PA"</span><span class="p">)</span>
<span class="k">return</span> <span class="n">p</span><span class="o">.</span><span class="n">parse_args</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">get_most_likely</span><span class="p">(</span><span class="n">estm</span><span class="p">):</span>
<span class="n">ans</span> <span class="o">=</span> <span class="bp">None</span>
<span class="n">prob</span> <span class="o">=</span> <span class="bp">None</span>
<span class="n">result</span> <span class="o">=</span> <span class="p">{}</span>
<span class="n">result</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">=</span> <span class="s">''</span>
<span class="n">result</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">res</span> <span class="ow">in</span> <span class="n">estm</span><span class="p">:</span>
<span class="k">if</span> <span class="n">prob</span> <span class="o">==</span> <span class="bp">None</span> <span class="ow">or</span> <span class="n">res</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">></span> <span class="n">prob</span> <span class="p">:</span>
<span class="n">ans</span> <span class="o">=</span> <span class="n">res</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">prob</span> <span class="o">=</span> <span class="n">res</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="n">result</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">=</span> <span class="n">ans</span>
<span class="n">result</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="n">prob</span>
<span class="k">return</span> <span class="n">result</span>
<span class="k">if</span> <span class="n">__name__</span> <span class="o">==</span> <span class="s">'__main__'</span><span class="p">:</span>
<span class="n">options</span><span class="p">,</span> <span class="n">remainder</span> <span class="o">=</span> <span class="n">parse_args</span><span class="p">()</span>
<span class="n">classifier</span> <span class="o">=</span> <span class="n">jubatus</span><span class="o">.</span><span class="n">Classifier</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">server_list</span><span class="p">,</span> <span class="n">options</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="n">config</span> <span class="o">=</span> <span class="p">{</span>
<span class="s">'converter'</span><span class="p">:</span> <span class="p">{</span>
<span class="s">"string_filter_types"</span><span class="p">:</span> <span class="p">{</span>
<span class="s">"detag"</span><span class="p">:</span> <span class="p">{</span> <span class="s">"method"</span><span class="p">:</span> <span class="s">"regexp"</span><span class="p">,</span> <span class="s">"pattern"</span><span class="p">:</span> <span class="s">"&lt;[^&gt;]*&gt;"</span><span class="p">,</span> <span class="s">"replace"</span><span class="p">:</span> <span class="s">""</span> <span class="p">}</span>
<span class="p">},</span>
<span class="s">"string_filter_rules"</span><span class="p">:</span>
<span class="p">[</span>
<span class="p">{</span> <span class="s">"key"</span><span class="p">:</span> <span class="s">"message"</span><span class="p">,</span> <span class="s">"type"</span><span class="p">:</span> <span class="s">"detag"</span><span class="p">,</span> <span class="s">"suffix"</span><span class="p">:</span> <span class="s">"-detagged"</span> <span class="p">}</span>
<span class="p">],</span>
<span class="s">'num_filter_types'</span><span class="p">:</span> <span class="p">{},</span>
<span class="s">'num_filter_rules'</span><span class="p">:</span> <span class="p">[],</span>
<span class="s">'string_types'</span><span class="p">:</span> <span class="p">{},</span>
<span class="s">'string_rules'</span><span class="p">:</span> <span class="p">[</span>
<span class="p">{</span><span class="s">'key'</span><span class="p">:</span> <span class="s">'message-detagged'</span><span class="p">,</span> <span class="s">'type'</span><span class="p">:</span> <span class="s">"space"</span><span class="p">,</span> <span class="s">"sample_weight"</span><span class="p">:</span> <span class="s">"bin"</span><span class="p">,</span> <span class="s">"global_weight"</span><span class="p">:</span> <span class="s">"bin"</span><span class="p">}</span>
<span class="p">],</span>
<span class="s">'num_types'</span><span class="p">:</span> <span class="p">{},</span>
<span class="s">'num_rules'</span><span class="p">:</span> <span class="p">[]</span>
<span class="p">},</span>
<span class="s">'method'</span><span class="p">:</span> <span class="n">options</span><span class="o">.</span><span class="n">algo</span><span class="p">,</span>
<span class="p">}</span>
<span class="n">classifier</span><span class="o">.</span><span class="n">set_config</span><span class="p">(</span><span class="n">config</span><span class="p">)</span>
<span class="k">print</span> <span class="n">classifier</span><span class="o">.</span><span class="n">get_config</span><span class="p">()</span>
<span class="k">print</span> <span class="n">classifier</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
<span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="nb">open</span><span class="p">(</span><span class="s">'train.dat'</span><span class="p">):</span>
<span class="n">label</span><span class="p">,</span> <span class="nb">file</span> <span class="o">=</span> <span class="n">line</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">','</span><span class="p">)</span>
<span class="n">dat</span> <span class="o">=</span> <span class="nb">open</span><span class="p">(</span><span class="nb">file</span><span class="p">)</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
<span class="n">classifier</span><span class="o">.</span><span class="n">train</span><span class="p">(</span>
<span class="p">[(</span> <span class="n">label</span> <span class="p">,</span> <span class="p">([[</span><span class="s">"message"</span><span class="p">,</span> <span class="n">dat</span><span class="p">]],</span> <span class="p">)</span> <span class="p">,)]</span>
<span class="p">)</span>
<span class="k">print</span> <span class="n">classifier</span><span class="o">.</span><span class="n">get_status</span><span class="p">()</span>
<span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="nb">open</span><span class="p">(</span><span class="s">'test.dat'</span><span class="p">):</span>
<span class="n">label</span><span class="p">,</span> <span class="nb">file</span> <span class="o">=</span> <span class="n">line</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">','</span><span class="p">)</span>
<span class="n">dat</span> <span class="o">=</span> <span class="nb">open</span><span class="p">(</span><span class="nb">file</span><span class="p">)</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
<span class="n">ans</span> <span class="o">=</span> <span class="n">classifier</span><span class="o">.</span><span class="n">classify</span><span class="p">(</span>
<span class="p">[([[</span><span class="s">"message"</span><span class="p">,</span> <span class="n">dat</span><span class="p">]],</span> <span class="p">)]</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">ans</span> <span class="o">!=</span> <span class="bp">None</span><span class="p">:</span>
<span class="n">estm</span> <span class="o">=</span> <span class="n">get_most_likely</span><span class="p">(</span><span class="n">ans</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">if</span> <span class="p">(</span><span class="n">label</span> <span class="o">==</span> <span class="n">estm</span><span class="p">[</span><span class="mi">0</span><span class="p">]):</span>
<span class="n">result</span> <span class="o">=</span> <span class="s">"OK"</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="s">"NG"</span>
<span class="k">print</span> <span class="n">result</span> <span class="o">+</span> <span class="s">","</span> <span class="o">+</span> <span class="n">label</span> <span class="o">+</span> <span class="s">", "</span> <span class="o">+</span> <span class="n">estm</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="s">", "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">estm</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
</pre></div>
</div>
<p><tt class="docutils literal"><span class="pre">train.dat</span></tt>, <tt class="docutils literal"><span class="pre">test.dat</span></tt> というファイルを作り、</p>
<div class="highlight-python"><pre>ラベル名,ファイルパス</pre>
</div>
<p>と各行に書き込み、次のようにして利用します。</p>
<div class="highlight-python"><pre>$ python tutorial.py -s localhost:9199 -n tutorial2</pre>
</div>
<p>以上で、下記の構成でJubatusを実行しました。</p>
<div class="figure">
<img alt="single client, single server" src="_images/single_single.png" style="width: 70%;" />
</div>
</div>
</div>
<div class="section" id="setup-jubatus-server-with-multiple-processes">
<span id="multiprocess"></span><h1>Setup Jubatus Server with multiple processes<a class="headerlink" href="#setup-jubatus-server-with-multiple-processes" title="Permalink to this headline">¶</a></h1>
<p>Jubatusでは、ZooKeeperを用いて複数のサーバプロセス間を協調させることで、分散処理を行うことが出来ます。</p>
<div class="section" id="setup-zookeeper">
<h2>Setup ZooKeeper<a class="headerlink" href="#setup-zookeeper" title="Permalink to this headline">¶</a></h2>
<div class="highlight-python"><pre>$ cd /path/to/zookeeper
$ bin/zkServer.sh start
JMX enabled by default
Using config: /zookeeper-3.3.3/bin/../conf/zoo.cfg
Starting zookeeper ...
STARTED
...</pre>
</div>
<p>以後、zoo.cfgでの指定によりローカルマシンのポート2181で起動していることを想定します。</p>
</div>
<div class="section" id="jubakeeper">
<h2>jubakeeper<a class="headerlink" href="#jubakeeper" title="Permalink to this headline">¶</a></h2>
<p>jubakeeperは、Jubatus内でクライアントからサーバ群へアクセスするためのインターフェースとなるプロセスです。
jubakeeperは、ZooKeeperを参照して、クライアントからのリクエストをclassifierへ仲介します。</p>
<div class="highlight-python"><pre>$ jubakeeper --zookeeper=localhost:2181 --rpc-port=9198</pre>
</div>
<p>これにより、jubakeeperは、9198ポートでRPCを待ち受けます。
jubakeeperを介した場合、起動しているサーバを意識することなくスケールアウトするように実装されています。</p>
</div>
<div class="section" id="running-two-processes-as-one-classifier-instance">
<h2>Running three processes as one classifier instance<a class="headerlink" href="#running-two-processes-as-one-classifier-instance" title="Permalink to this headline">¶</a></h2>
<p>3並列でクライアントからのリクエストを受け付けたい場合は、jubaclassifierを3つ起動します。
<tt class="docutils literal"><span class="pre">--name</span></tt> で同じ名前を指定することにより、3つのプロセスがひとつのインスタンスとして協調動作します。
同じマシン内で複数プロセスを起動する場合は、プロセスごとにポートを変えなければならないことに注意してください。</p>
<div class="highlight-python"><pre>$ jubaclassifier --rpc-port=9180 --name=tutorial2 --zookeeper=localhost:2181 --storage=local_mixture &amp;
$ jubaclassifier --rpc-port=9181 --name=tutorial2 --zookeeper=localhost:2181 --storage=local_mixture &amp;
$ jubaclassifier --rpc-port=9182 --name=tutorial2 --zookeeper=localhost:2181 --storage=local_mixture &amp;</pre>
</div>
<p>zookeeperのクライアントを用いて、たしかに三つのサーバプロセスが起動していることを確認することも出来ます。</p>
<div class="highlight-python"><pre>$ cd /path/to/zookeeper
$ bin/zkCli.sh -server localhost:2181
[zk: localhost:2181(CONNECTED) 0] ls /jubatus/actors/tutorial2/nodes
[XXX.XXX.XXX.XXX_9180, XXX.XXX.XXX.XXX_9181, XXX.XXX.XXX.XXX_9182]</pre>
</div>
<p>以上で、下記の構成でJubatusを実行しました。</p>
<div class="figure">
<img alt="single client, multi servers" src="_images/single_multi.png" style="width: 70%;" />
</div>
</div>
</div>
<div class="section" id="setup-jubatus-in-cluster">
<h1>Setup Jubatus in cluster<a class="headerlink" href="#setup-jubatus-in-cluster" title="Permalink to this headline">¶</a></h1>
<p>Jubatusは各種プロセスを一括管理するための仕組みを備えています。</p>
<p>今、それぞれのサーバに対して、以下の表に対応したプロセスを起動させることを考えます。</p>
<table border="1" class="docutils">
<colgroup>
<col width="35%" />
<col width="65%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">IP address</th>
<th class="head">processes</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>192.168.0.1</td>
<td>操作端末</td>
</tr>
<tr class="row-odd"><td>192.168.0.10</td>
<td>classifier - 1</td>
</tr>
<tr class="row-even"><td>192.168.0.20</td>
<td>classifier - 2</td>
</tr>
<tr class="row-odd"><td>192.168.0.30</td>
<td>classifier - 3</td>
</tr>
<tr class="row-even"><td>192.168.0.100</td>
<td>jubakeeper/zookeeper - 1</td>
</tr>
<tr class="row-odd"><td>192.168.0.200</td>
<td>jubakeeper/zookeeper - 2</td>
</tr>
</tbody>
</table>
<div class="highlight-python"><pre>[192.168.0.100]$ bin/zkServer.sh start
[192.168.0.200]$ bin/zkServer.sh start</pre>
</div>
<p>zookeeperをそれぞれで立ち上げます。zoo.cfgには二台で構成する設定を書いてください。
そして、クライアントから利用するためにjubakeeperを用意しておきます。jubakeeperはデフォルトで9198番ポートを利用します。</p>
<div class="highlight-python"><pre>[192.168.0.100]$ jubakeeper --zookeeper=192.168.0.100:2181,192.168.0.200:2181 -d
[192.168.0.200]$ jubakeeper --zookeeper=192.168.0.100:2181,192.168.0.200:2181 -d</pre>
</div>
<div class="section" id="jubavisor-process-management-with-zookeeper">
<h2>Jubavisor(Process Management with zookeeper)<a class="headerlink" href="#jubavisor-process-management-with-zookeeper" title="Permalink to this headline">¶</a></h2>
<p>jubavisorは、マシンごとに一プロセスずつ存在するagentで、jubactlからの指令を受けて同サーバ内のプロセスを管理します。
このプロセスは、予めマシンごとに起動しておく必要があります。jubavisorはデフォルトで9199番ポートを利用します。</p>
<div class="highlight-python"><pre>[192.168.0.10 ]$ jubavisor -z 192.168.0.100:2181,192.168.0.200:2181 -d
[192.168.0.20 ]$ jubavisor -z 192.168.0.100:2181,192.168.0.200:2181 -d
[192.168.0.30 ]$ jubavisor -z 192.168.0.100:2181,192.168.0.200:2181 -d</pre>
</div>
<p>jubavisorは、一台のサーバ内の複数プロセスのポートを調整して指定されたプロセスを指定された名前空間で起動し、zookeeperに登録します。
ここまで出来れば、後は操作端末から、自由にプロセスを管理することが出来ます。
Let’s start provisioning!!</p>
<div class="highlight-python"><pre>[192.168.0.1 ]$ jubactl -c start --type=classifier --name=tutorial2 -z 192.168.0.100:2181,192.168.0.200:2181
[192.168.0.1 ]$ jubactl --name=tutorial2 --zookeeper=192.168.0.100:2181,192.168.0.200:2181 --type=classifier -c status
active jubakeeper members:
192.168.0.100_9198
192.168.0.200_9198
active jubavisor members:
192.168.0.10_9199
192.168.0.20_9199
192.168.0.30_9199
active tutorial2 members:
192.168.0.10_9180
192.168.0.20_9180
192.168.0.30_9180</pre>
</div>
<div class="highlight-python"><pre>[192.168.0.1 ]$ jubactl -c stop --type=classifier --name=tutorial2 -z 192.168.0.100:2181,192.168.0.200:2181</pre>
</div>
</div>
<div class="section" id="client-for-multi-process-jubatus-server">
<h2>Client for multi process Jubatus Server<a class="headerlink" href="#client-for-multi-process-jubatus-server" title="Permalink to this headline">¶</a></h2>
<p>最後に、複数クライアント、複数サーバ環境でtutorialを実行しましょう。</p>
<table border="1" class="docutils">
<colgroup>
<col width="35%" />
<col width="65%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">IP address</th>
<th class="head">processes</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>192.168.0.1</td>
<td>操作端末</td>
</tr>
<tr class="row-odd"><td>192.168.0.2</td>
<td>client - 1</td>
</tr>
<tr class="row-even"><td>192.168.0.3</td>
<td>client - 2</td>
</tr>
<tr class="row-odd"><td>192.168.0.4</td>
<td>client - 3</td>
</tr>
<tr class="row-even"><td>192.168.0.10</td>
<td>classifier - 1</td>
</tr>
<tr class="row-odd"><td>192.168.0.20</td>
<td>classifier - 2</td>
</tr>
<tr class="row-even"><td>192.168.0.30</td>
<td>classifier - 3</td>
</tr>
<tr class="row-odd"><td>192.168.0.100</td>
<td>jubakeeper/zookeeper - 1</td>
</tr>
<tr class="row-even"><td>192.168.0.200</td>
<td>jubakeeper/zookeeper - 2</td>
</tr>
</tbody>
</table>
<div class="highlight-python"><pre>[192.168.0.1 ]$ jubactl -c start --type=classifier --name=tutorial3 -z 192.168.0.100:2181,192.168.0.200:2181
[192.168.0.2 ]$ python tutorial.py --name=tutorial3 -s 192.168.0.100:9198,192.168.0.200:9198
[192.168.0.3 ]$ python tutorial.py --name=tutorial3 -s 192.168.0.100:9198,192.168.0.200:9198</pre>
</div>
<p>以上で、下記の構成でJubatusを実行しました。</p>
<div class="figure">
<img alt="multi clients, multi servers" src="_images/multi_multi.png" style="width: 70%;" />
</div>
<p>以上でチュートリアルは終わりです。</p>
</div>
</div>
</div>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="related">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="genindex.html" title="General Index"
>index</a></li>
<li class="right" >
<a href="references.html" title="References"
>next</a> |</li>
<li class="right" >
<a href="gettingstarted.html" title="Getting Started"
>previous</a> |</li>
<li><a href="index.html">Jubatus 0.1.0 documentation</a> »</li>
<li><a href="gettingstarted.html" >Getting Started</a> »</li>
</ul>
</div>
<div class="footer">
© Copyright 2011, PFI&amp;NTT.
Created using <a href="http://sphinx.pocoo.org/">Sphinx</a> 1.1.
</div>
</body>
</html>