Last change
on this file since 121 was
121,
checked in by mitty, 13 years ago
|
- accept -f option to save wiki text to file
|
File size:
921 bytes
|
Rev | Line | |
---|
[120] | 1 | #! /usr/bin/perl |
---|
| 2 | |
---|
| 3 | use strict; |
---|
| 4 | use warnings; |
---|
| 5 | use utf8; |
---|
| 6 | |
---|
| 7 | use Web::Scraper; |
---|
| 8 | use URI; |
---|
| 9 | use Data::Validate::URI qw(is_uri); |
---|
| 10 | use Encode; |
---|
| 11 | |
---|
# ---------------------------------------------------------------------------
# Main script: fetch the wiki source text of a page and print it.
#
# Usage: $0 [-f] url_to_pukiwikiplus_page
#
#   -f   write the text to "<hex page name>.txt" in the current directory
#        (name produced by wikifile()) instead of printing it on STDOUT.
#
# The URL is rewritten to its "?cmd=source&page=..." form unless it already
# contains "cmd=source"; the text is then taken from the <pre id="source">
# element of the fetched page.
# ---------------------------------------------------------------------------

# Consume the optional -f switch first, so that "-f" without a URL falls
# through to the usage message instead of failing URI validation on undef.
my $save_to_file = 0;
if (@ARGV && $ARGV[0] eq '-f') {
    shift @ARGV;
    $save_to_file = 1;
}

if (@ARGV < 1) {
    print "$0: [-f] url_to_pukiwikiplus_page\n";
    exit;
}

my $url = $ARGV[0];
die "$0: '$url' is not URI" unless is_uri($url);

# Work out the page name and make sure $url points at the source view.
my $page;
if ($url =~ /cmd=source/) {
    # Already a source-view URL: leave it alone, but still pick up the page
    # name so that -f can derive a proper file name from it.
    ($page) = $url =~ /[?&;]page=([^&;#]+)/;
}
elsif ($url =~ /\?(.+)/) {
    # Everything after the first '?' is treated as the page name.
    $page = $1;
    $url =~ s{\?.+}{?cmd=source&page=$page};
}
# A URL with no query string is fetched unchanged.

# Output file name; stays undef unless -f was given.
my $file;
if ($save_to_file) {
    die "$0: cannot determine page name from '$url' for output file\n"
        unless defined $page && length $page;
    $file = wikifile($page) . '.txt';
}

print STDERR "getting: '$url'\n";

my $source = scraper {
    process "pre#source", "source" => "TEXT";
    result "source";
};

my $text = $source->scrape( URI->new($url) );

# Without this check a page lacking <pre id="source"> would yield an
# "uninitialized value" warning and, with -f, an empty output file.
die "$0: no wiki source found at '$url'\n" unless defined $text;

my $out_fh;
if (defined $file) {
    print STDERR "saving: '$file'\n";
    open $out_fh, '>', $file
        or die "$0: cannot open '$file' for writing: $!\n";
}
else {
    $out_fh = \*STDOUT;
}

print {$out_fh} encode('utf8', $text);

# Buffered write errors only surface at close time.
close $out_fh or die "$0: error while writing output: $!\n";
---|
[121] | 46 | |
---|
| 47 | |
---|
# wikifile($pagename)
#
# Turn a page name as it appears in a URL into an upper-case hexadecimal
# string: percent-escapes (%XX) are first decoded to single characters, then
# the whole name is rendered as hex digits, two per character.
# The caller appends ".txt" to form the output file name.
# NOTE(review): presumably this mirrors the wiki's own on-disk page naming -
# confirm against the wiki installation.
sub wikifile {
    my ($pagename) = @_;

    # Decode each %XX escape into the character with that code.
    $pagename =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/chr(hex($1))/eg;

    my $hex_name = unpack('H*', $pagename);
    return uc($hex_name);
}
---|
Note: See
TracBrowser
for help on using the repository browser.