集合知プログラミング 3.3をPerl化してみた(形態素解析+単語数カウント)
#!/usr/bin/perl
# Word-frequency counter for an RSS feed.
#
# Fetches a feed, runs every entry's text through the MeCab morphological
# analyzer and dumps how often each surface form (token) occurs.
#
# Usage: perl generatefeedvector.pl [FEED_URL]

{
    package FeedParser;

    use strict;
    use warnings;
    use LWP::UserAgent;
    use XML::Simple;

    # Construct a (stateless) parser object.
    sub new {
        my $class = shift;
        return bless {}, $class;
    }

    # Download $url and return the feed as an XML::Simple data structure.
    # Dies with the HTTP status line when the request does not succeed.
    sub parse {
        my ($self, $url) = @_;

        # get() follows redirects, which simple_request() does not.
        my $response = LWP::UserAgent->new->get($url);
        die "Failed to fetch $url: " . $response->status_line
            unless $response->is_success;

        # ForceArray keeps 'item' an array reference even when the feed holds
        # a single entry; KeyAttr => [] stops XML::Simple from folding that
        # array into a hash keyed on a name/key/id child.
        return XMLin(
            $response->content,
            ForceArray => ['item'],
            KeyAttr    => [],
        );
    }
}

package main;

use utf8;
use strict;
use warnings;
use Dumpvalue;
use MeCab;

# Feed analysed when no URL is given on the command line.
my $DEFAULT_FEED_URL = 'http://d.hatena.ne.jp/rin1024/rss';

my $mecab  = MeCab::Tagger->new;
my $parser = FeedParser->new;

# Count the tokens of every entry in the feed at $url.
#
# Returns a two-element list: the feed title (undef when the feed has none)
# and a reference to a hash mapping each token to its number of occurrences.
sub getWordCounts {
    my $url = shift;
    my %wc;

    # parse xml data
    my $documents = $parser->parse($url);

    # analysis entry data
    for my $entry (_feed_items($documents)) {
        my $summary = _entry_text($entry);
        next unless defined $summary;

        # extract attributes
        for (my $node = getWords($summary); $node; $node = $node->{next}) {
            # Progress marker; written to STDERR so that STDOUT carries
            # nothing but the final dump.
            print STDERR "-";

            my $word = $node->{surface};
            next unless defined $word && length $word;

            # Field 7 of the comma-separated feature string is treated as the
            # reading; tokens whose reading is '*' are skipped.
            # NOTE(review): presumably this is what drops MeCab's BOS/EOS
            # sentinel nodes -- confirm against the dictionary in use.
            my $yomi = (split /,/, $node->{feature})[7];
            next if defined $yomi && $yomi eq '*';

            $wc{$word}++;
        }
    }

    my $title = _feed_title($documents);
    return $title, \%wc;
}

# Strip HTML tags from $content, lower-case it and hand it to MeCab.
# Returns the first node of the resulting token list.
sub getWords {
    my $content = shift;
    $content = '' unless defined $content;

    # triming html tags ([^>]* also covers tags that span several lines).
    $content =~ s/<[^>]*>//g;

    # lower case
    $content = lc $content;

    # return results.
    return $mecab->parseToNode($content);
}

# Return the list of entry hashes of a parsed feed. Items are looked up at
# the top level first (RSS 1.0 / RDF layout) and then under 'channel'
# (RSS 2.0 layout); anything that is not a hash reference is dropped.
sub _feed_items {
    my $documents = shift;
    return unless ref($documents) eq 'HASH';

    my $items = $documents->{item};
    if (!$items && ref($documents->{channel}) eq 'HASH') {
        $items = $documents->{channel}{item};
    }
    return unless ref($items) eq 'ARRAY';

    return grep { ref($_) eq 'HASH' } @{$items};
}

# Return the feed title as a plain string, or undef when none is found.
# The top-level title wins; the channel title is the fallback.
sub _feed_title {
    my $documents = shift;
    return unless ref($documents) eq 'HASH';

    my $title = $documents->{title};
    if (!defined $title && ref($documents->{channel}) eq 'HASH') {
        $title = $documents->{channel}{title};
    }
    return $title;
}

# Pick the text of an entry: the first usable value among summary,
# description and content. Returns undef when the entry carries no text.
sub _entry_text {
    my $entry = shift;

    for my $field (qw(summary description content)) {
        my $value = $entry->{$field};

        # XML::Simple represents an element that has attributes as a hash
        # with the text under 'content', and an empty element as {}.
        $value = $value->{content} if ref($value) eq 'HASH';

        next if !defined $value || ref $value || $value eq '';
        return $value;
    }
    return;
}

# do execute
my $feed_url = @ARGV ? $ARGV[0] : $DEFAULT_FEED_URL;
my ($title, $list) = getWordCounts($feed_url);
Dumpvalue->new->dumpValue($list);

1;
3.3の記事とは少し変えて、はてなダイアリー(はてダ)のRSSフィードから
単語の出現数をカウントするようにしてみた。
実行結果はこんなん。
まだ英単語をlcで小文字に統一するくらいしかしていないので、ノイズが多いです。
$ perl generatefeedvector.pl '#!/' => 4 '%' => 4 '(' => 7 '()' => 1 '();' => 1 ')' => 1 ',' => 9 '-' => 4 '.' => 19 '/' => 13 '//' => 1 0 => 1 2 => 4 3 => 4 5 => 7 62 => 1 ':' => 1 '://' => 2 ';' => 15 '=' => 6 '=&#' => 1 '=>' => 12 'a' => 5 'add' => 1 'and' => 1 'b' => 1 'bin' => 4 'cakeonvim' => 1 'cakephp' => 1 'codecheck' => 1 'com' => 1 'du' => 1 'dumpvalue' => 4 'edition' => 1 'euclid' => 1 'fi' => 1 'function' => 1 'home' => 1 'http' => 2 'in' => 5 'just' => 1 'lady' => 4 'let' => 1 'lisa' => 4 'luck' => 1 'me' => 1 'mixi' => 1 'my' => 5 'newdatarequest' => 1 'newfe' => 1 'on' => 4 'opensocial' => 1 'pc' => 1 'pearson' => 2 'perl' => 7 'pl' => 3 'plane' => 4 'prefs' => 4 'professi' => 1 'professional' => 1 'python' => 3 'rails' => 1 'real' => 1 'realvnc' => 1 'request' => 2 'require' => 3 'returns' => 1 'rose' => 4 's' => 1 'share' => 1 'snakes' => 4 'strict' => 4 'superman' => 1 'the' => 4 'toby' => 1 'trac' => 1 'use' => 12 'usr' => 4 'var' => 1 'vim' => 2 'vnc' => 1 'w' => 4 'warnings' => 4 'water' => 4 'wiki' => 1 'windows' => 1 'www' => 1 'xp' => 1 'you' => 1 '{' => 5 '’' => 35 '…' => 1 '、' => 16 '。' => 12 'あっ' => 2 'あんま' => 1 'うだうだ' => 1 'お' => 1 'おすすめ' => 1 'か' => 2 'かかれ' => 3 'から' => 1 'が' => 5 'ぎゃ' => 1 'くらい' => 1 'くれる' => 1 'けど' => 2 'これ' => 2 'さ' => 2 'さすが' => 1 'さん' => 2 'し' => 9 'する' => 1 'そう' => 1 'その後' => 2 'た' => 13 'たら' => 2 'だ' => 5 'ちょっと' => 1 'って' => 2 'っと' => 1 'つくら' => 1 'て' => 22 'てる' => 3 'で' => 11 'できる' => 1 'です' => 1 'でも' => 1 'と' => 8 'とか' => 3 'とかし' => 1 'とりあえず' => 2 'ど' => 1 'どれ' => 1 'な' => 2 'ない' => 7 'なんだか' => 1 'なー' => 3 'に' => 4 'の' => 14 'ので' => 6 'のに' => 1 'は' => 6 'ぷらぎんで' => 1 'ぼけ' => 1 'まぁ' => 1 'まし' => 1 'ます' => 1 'み' => 4 'みたい' => 1 'めんどくさい' => 1 'も' => 1 'もの' => 1 'もんもん' => 1 'やっぱり' => 1 'やつ' => 1 'れ' => 2 'を' => 7 'ん' => 3 'アシスタント' => 1 'カウント' => 1 'コミュニティ' => 6 'セットアップ' => 1 'ゼミ' => 1 'ダメ' => 1 'デスク' => 1 'トップ' => 1 'ハード' => 1 'ピアソン' => 1 'ボタン' => 1 'マイミク' => 1 'メール' => 1 'ユークリッド' => 1 'ユーザー' => 1 'リモート' => 2 'ログイン' 
=> 1 '・' => 2 'ー' => 2 '一' => 1 '一覧' => 1 '下調べ' => 1 '不具合' => 1 '且つ' => 1 '中' => 1 '人' => 2 '仕事' => 1 '以下' => 1 '以前' => 1 '会社' => 1 '作っ' => 2 '使い方' => 1 '使用' => 1 '側' => 2 '僕' => 1 '入っ' => 1 '全然' => 1 '分から' => 1 '初' => 1 '割る' => 1 '化' => 1 '双方' => 1 '取得' => 1 '同じ' => 1 '大学' => 1 '大杉' => 1 '大盤振る舞い' => 1 '室' => 1 '工業' => 1 '度' => 1 '思っ' => 3 '悔しい' => 1 '成績' => 1 '承諾' => 1 '押さ' => 1 '抽出' => 1 '接続' => 1 '放置' => 1 '昨日' => 1 '最近' => 1 '無い' => 1 '直接' => 1 '相手' => 2 '相関' => 1 '知ら' => 1 '研究' => 1 '突っ' => 1 '立っ' => 1 '系' => 1 '紹介' => 1 '総数' => 2 '考慮' => 1 '者' => 4 '聴か' => 1 '若干' => 1 '表示' => 1 '被っ' => 1 '覚え' => 1 '計算' => 1 '訪問' => 3 '詳しい' => 1 '誤差' => 1 '説明' => 1 '調べ' => 1 '距離' => 1 '近い' => 1 '近似' => 1 '送っ' => 1 '途中' => 1 '重複' => 1 '関数' => 1 '閲覧' => 2 '面倒' => 1 '面白' => 1 '面白く' => 1 '!' => 4 'ry' => 1