集合知プログラミング 3.3をPerl化してみた(形態素解析+単語数カウント)

#!/usr/bin/perl

package FeedParser;

# Minimal feed fetcher: downloads a document over HTTP and converts the XML
# into a Perl data structure with XML::Simple.

use strict;
use warnings;

use LWP::UserAgent;
use XML::Simple;

# Constructor. Takes no arguments; the object carries no state.
sub new {
  my $class = shift;
  return bless {}, $class;
}

# Fetch $url and parse the response body as XML.
#
#   $url - address of the feed to download
#
# Returns whatever XMLin() builds from the downloaded document.
# Dies with the URL and the HTTP status line when the request does not
# succeed, rather than handing an error page or an empty body to the parser.
sub parse {
  my ($self, $url) = @_;

  my $ua = LWP::UserAgent->new;

  # get() follows redirects, which simple_request() does not.
  my $response = $ua->get($url);
  die "Failed to fetch $url: " . $response->status_line
    unless $response->is_success;

  # The undecoded body is passed on so the XML parser can apply the
  # document's own encoding declaration.
  return XMLin($response->content);
}

1;

package main;

use utf8;
use strict;
use warnings;
use Dumpvalue;
use MeCab;

# File-scoped helpers shared by the subs in this script:
# a MeCab tagger for tokenising text and a FeedParser for fetching feeds.
my $mecab  = MeCab::Tagger->new();
my $parser = FeedParser->new();

# Download the feed at $url and count how often each MeCab token occurs in
# the text of its entries.
#
#   $url - address of the feed to analyse
#
# Returns a two-element list: the feed title (may be undef) and a reference
# to a hash mapping each token's surface form to its number of occurrences.
sub getWordCounts {
  my $url = shift;
  my %wc;

  # parse xml data
  my $documents = $parser->parse($url);

  # XML::Simple yields a plain hash reference instead of an array reference
  # when the document holds exactly one <item>, and nothing when it holds
  # none; normalise so the loop never dereferences a non-array.
  my $items   = $documents->{item};
  my @entries = ref($items) eq 'ARRAY' ? @{$items}
              : defined $items         ? ($items)
              :                          ();

  # analyse entry data
  foreach my $entry (@entries) {
    next unless ref($entry) eq 'HASH';

    # First non-empty text field, in order of preference.
    my $summary = $entry->{summary}
               || $entry->{description}
               || $entry->{content};

    # Skip entries without usable text (missing, or a nested structure
    # rather than a string) instead of feeding them to the tagger.
    next if !defined $summary || ref $summary;

    # extract attributes
    for (my $node = getWords($summary); $node; $node = $node->{next}) {
      # Emits one "-" per node (looks like a progress/debug marker).
      print "-";
      next unless defined $node->{surface};
      my $word = $node->{surface};

      # Field 7 of the comma-separated feature string is treated as the
      # reading; nodes whose reading is the placeholder '*' are not counted.
      my $yomi = (split /,/, $node->{feature})[7];
      next if defined $yomi && $yomi eq '*';

      $wc{$word}++;
    }
  }

  # Items are read from the top level of the document; feeds laid out that
  # way presumably keep their title under <channel>, so fall back to it
  # when there is no top-level title.
  my $title = $documents->{title};
  if (!defined $title && ref($documents->{channel}) eq 'HASH') {
    $title = $documents->{channel}{title};
  }

  return $title, \%wc;
}

# Strip markup from $content, lowercase it and run it through MeCab.
#
#   $content - entry text, possibly containing HTML tags; undef is treated
#              as an empty string
#
# Returns the value of $mecab->parseToNode(); callers walk the node list
# through each node's {next} field.
sub getWords {
  my $content = shift;
  $content = '' unless defined $content;

  # Trim HTML tags. A negated character class is used instead of `.*?` so
  # that tags broken across several lines are removed as well.
  $content =~ s/<[^>]*>//g;

  # lower case
  $content = lc($content);

  # NOTE(review): the returned nodes may refer back to the input buffer
  # inside MeCab; confirm that $content going out of scope here is safe.
  return $mecab->parseToNode($content);
}

# do execute
# The feed URL may be given as the first command-line argument; without one
# the built-in default feed is used.
my $feed_url = @ARGV ? $ARGV[0] : 'http://d.hatena.ne.jp/rin1024/rss';
my($title, $list) = getWordCounts($feed_url);

# Dump the token => count hash to standard output.
Dumpvalue->new->dumpValue($list);

1;

3.3の記事とは少し変えて、はてダ(はてなダイアリー)のRSSフィードから
単語の出現数をカウントするようにしてみた。
実行結果はこんな感じ。
まだ英単語をlcで小文字に統一するくらいしかしていないので、ノイズが多いです。

$ perl generatefeedvector.pl
'#!/' => 4
'%' => 4
'(' => 7
'()' => 1
'();' => 1
')' => 1
',' => 9
'-' => 4
'.' => 19
'/' => 13
'//' => 1
0 => 1
2 => 4
3 => 4
5 => 7
62 => 1
':' => 1
'://' => 2
';' => 15
'=' => 6
'=&#' => 1
'=>' => 12
'a' => 5
'add' => 1
'and' => 1
'b' => 1
'bin' => 4
'cakeonvim' => 1
'cakephp' => 1
'codecheck' => 1
'com' => 1
'du' => 1
'dumpvalue' => 4
'edition' => 1
'euclid' => 1
'fi' => 1
'function' => 1
'home' => 1
'http' => 2
'in' => 5
'just' => 1
'lady' => 4
'let' => 1
'lisa' => 4
'luck' => 1
'me' => 1
'mixi' => 1
'my' => 5
'newdatarequest' => 1
'newfe' => 1
'on' => 4
'opensocial' => 1
'pc' => 1
'pearson' => 2
'perl' => 7
'pl' => 3
'plane' => 4
'prefs' => 4
'professi' => 1
'professional' => 1
'python' => 3
'rails' => 1
'real' => 1
'realvnc' => 1
'request' => 2
'require' => 3
'returns' => 1
'rose' => 4
's' => 1
'share' => 1
'snakes' => 4
'strict' => 4
'superman' => 1
'the' => 4
'toby' => 1
'trac' => 1
'use' => 12
'usr' => 4
'var' => 1
'vim' => 2
'vnc' => 1
'w' => 4
'warnings' => 4
'water' => 4
'wiki' => 1
'windows' => 1
'www' => 1
'xp' => 1
'you' => 1
'{' => 5
'’' => 35
'…' => 1
'、' => 16
'。' => 12
'あっ' => 2
'あんま' => 1
'うだうだ' => 1
'お' => 1
'おすすめ' => 1
'か' => 2
'かかれ' => 3
'から' => 1
'が' => 5
'ぎゃ' => 1
'くらい' => 1
'くれる' => 1
'けど' => 2
'これ' => 2
'さ' => 2
'さすが' => 1
'さん' => 2
'し' => 9
'する' => 1
'そう' => 1
'その後' => 2
'た' => 13
'たら' => 2
'だ' => 5
'ちょっと' => 1
'って' => 2
'っと' => 1
'つくら' => 1
'て' => 22
'てる' => 3
'で' => 11
'できる' => 1
'です' => 1
'でも' => 1
'と' => 8
'とか' => 3
'とかし' => 1
'とりあえず' => 2
'ど' => 1
'どれ' => 1
'な' => 2
'ない' => 7
'なんだか' => 1
'なー' => 3
'に' => 4
'の' => 14
'ので' => 6
'のに' => 1
'は' => 6
'ぷらぎんで' => 1
'ぼけ' => 1
'まぁ' => 1
'まし' => 1
'ます' => 1
'み' => 4
'みたい' => 1
'めんどくさい' => 1
'も' => 1
'もの' => 1
'もんもん' => 1
'やっぱり' => 1
'やつ' => 1
'れ' => 2
'を' => 7
'ん' => 3
'アシスタント' => 1
'カウント' => 1
'コミュニティ' => 6
'セットアップ' => 1
'ゼミ' => 1
'ダメ' => 1
'デスク' => 1
'トップ' => 1
'ハード' => 1
'ピアソン' => 1
'ボタン' => 1
'マイミク' => 1
'メール' => 1
'ユークリッド' => 1
'ユーザー' => 1
'リモート' => 2
'ログイン' => 1
'・' => 2
'ー' => 2
'一' => 1
'一覧' => 1
'下調べ' => 1
'不具合' => 1
'且つ' => 1
'中' => 1
'人' => 2
'仕事' => 1
'以下' => 1
'以前' => 1
'会社' => 1
'作っ' => 2
'使い方' => 1
'使用' => 1
'側' => 2
'僕' => 1
'入っ' => 1
'全然' => 1
'分から' => 1
'初' => 1
'割る' => 1
'化' => 1
'双方' => 1
'取得' => 1
'同じ' => 1
'大学' => 1
'大杉' => 1
'大盤振る舞い' => 1
'室' => 1
'工業' => 1
'度' => 1
'思っ' => 3
'悔しい' => 1
'成績' => 1
'承諾' => 1
'押さ' => 1
'抽出' => 1
'接続' => 1
'放置' => 1
'昨日' => 1
'最近' => 1
'無い' => 1
'直接' => 1
'相手' => 2
'相関' => 1
'知ら' => 1
'研究' => 1
'突っ' => 1
'立っ' => 1
'系' => 1
'紹介' => 1
'総数' => 2
'考慮' => 1
'者' => 4
'聴か' => 1
'若干' => 1
'表示' => 1
'被っ' => 1
'覚え' => 1
'計算' => 1
'訪問' => 3
'詳しい' => 1
'誤差' => 1
'説明' => 1
'調べ' => 1
'距離' => 1
'近い' => 1
'近似' => 1
'送っ' => 1
'途中' => 1
'重複' => 1
'関数' => 1
'閲覧' => 2
'面倒' => 1
'面白' => 1
'面白く' => 1
'!' => 4
'ry' => 1