Home
Asymmetrical [entries|archive|friends|userinfo]
kyle_burton

[ website | My Website ]
[ userinfo | livejournal userinfo ]
[ archive | journal archive ]

Landmark Parsing [Feb. 29th, 2008|12:22 am]
[Tags|]

Someone asked for an example, so I dug up my jspwiki tool. Here are the guts of the parser:


# Build a small "landmark" parser over a string.
#
# The parser keeps a single cursor ($pos) into $data and exposes closures that
# move the cursor relative to literal substrings ("landmarks").  A cursor of
# -1 means "exhausted / landmark not found"; once exhausted, every movement
# closure keeps returning -1 until $start or $setData resets the cursor.
#
# Argument:
#   $data - the text to scan (undef is treated as the empty string).
#
# Returns a list of nine code refs, in this order:
#   $setData->($text)    replace the text and reset the cursor to 0
#   $start->()           reset the cursor to 0
#   $fwd->($n)           move forward $n characters
#   $bck->($n)           move backward $n characters
#   $fwdTo->($str)       move to the next occurrence of $str
#   $fwdPast->($str)     move just past the next occurrence of $str
#   $bckTo->($str)       move back to the previous occurrence of $str
#   $btwn->($s, $e)      return the text between the next $s and the $e that
#                        follows it, or undef if either landmark is missing;
#                        the cursor is left at the start of $e
#   $all->($s, $e)       return the list of every successive $btwn->($s, $e)
sub makeParser {
  my($data) = @_;
  $data = '' unless defined $data;  # avoid length/index on undef
  my $pos = 0;
  my $setData = sub { $data = defined $_[0] ? $_[0] : ''; $pos = 0; };
  my $start   = sub { $pos = 0 };
  my $fwd     = sub { return -1 if $pos == -1; $pos += $_[0]; $pos = -1 if $pos >= length($data); $pos };
  my $bck     = sub { return -1 if $pos == -1; $pos -= $_[0]; $pos = -1 if $pos < 0;              $pos };
  # index/rindex already yield -1 when the landmark is absent, which doubles
  # as the "exhausted" marker.
  my $bckTo   = sub { return -1 if $pos == -1; $pos = rindex $data, $_[0], $pos; };
  my $fwdTo   = sub { return -1 if $pos == -1; $pos =  index $data, $_[0], $pos; };
  my $fwdPast = sub {
    return $pos if $pos == -1;
    $pos = index $data, $_[0], $pos;
    return $pos if $pos == -1;
    $pos += length($_[0]);
    # A landmark that ends exactly at the end of the text leaves nothing to
    # read after it, so the cursor becomes exhausted.
    $pos = -1 if $pos >= length($data);
    return $pos;
  };
  my $btwn = sub {
    # Every failure path returns undef (always exactly one scalar), so callers
    # can tell "no match" apart from any extracted text.  An exhausted cursor
    # used to return -1 here, which looked like an extracted item.
    return undef if $pos == -1;
    my $s = $fwdPast->($_[0]);
    return undef if $s == -1;
    my $e = $fwdTo->($_[1]);
    return undef if $e == -1;
    return substr $data, $s, $e - $s;
  };

  my $all = sub {
    my @all;
    while (-1 != $pos) {
      my $before = $pos;
      my $item = $btwn->(@_);
      # Test definedness, not truth: an empty item or "0" is a valid match
      # and must not end the scan early.
      last unless defined $item;
      # The cursor only fails to advance when the start landmark is the empty
      # string; stop there rather than loop forever.
      last if $pos == $before;
      push @all, $item;
    }
    return @all;
  };

  return ($setData,$start,$fwd,$bck,$fwdTo,$fwdPast,$bckTo,$btwn,$all);
}

and here is how it gets used:

# Fetch the PageInfo page for $topic and print its version table as
# tab-separated text on STDOUT.
#
# Argument:
#   $topic - page name, interpolated into the query string as-is.
#            NOTE(review): presumably already URL-safe; confirm against callers.
#
# Relies on $UserAgent and $BaseURL, which are defined outside this sub, and
# on the sibling subs makeParser and dumpRow.
sub getPageInfo {
  my($topic) = @_;
  my $data = $UserAgent->get("$BaseURL/PageInfo.jsp?page=$topic")->content;
  # The raw page is echoed before the parsed table.
  print "$data\n";

  # makeParser returns nine closures; index 7 is $btwn and index 8 is $all.
  my $btwn = (makeParser($data))[7];
  my $table = $btwn->('Version','</table>');
  # If either landmark is missing $btwn yields undef; fall back to an empty
  # table so the header is still printed and no rows follow.
  $table = '' unless defined $table;

  my $all = (makeParser($table))[8];
  # The last header is one column, so it cannot go through qw(), which would
  # split it into three separate tab-separated fields.
  print join("\t",'Version','Date','Author','Size','Changes from Previous'),"\n";
  dumpRow($_) for $all->('<tr>','</tr>');
}

# Print one table row as a single tab-separated line on STDOUT.
#
# Argument:
#   $row - the text of one row; each '<td>'...'</td>' span is one cell.
#
# Each cell is passed through simpleStrip (defined elsewhere; presumably it
# removes markup -- confirm) before printing.
sub dumpRow {
  my($row) = @_;
  # makeParser returns nine closures; only the last one ($all) is used here.
  my $all = (makeParser($row))[-1];
  my @cells = map { simpleStrip($_) } $all->('<td>','</td>');
  print join("\t", @cells), "\n";
}

link1 comment|post comment

navigation
[ viewing | most recent entries ]

Advertisement