trunk
Line | |
---|
1 | #! /usr/bin/perl |
---|
2 | |
---|
3 | use strict; |
---|
4 | use warnings; |
---|
5 | use utf8; |
---|
6 | |
---|
7 | use Web::Scraper; |
---|
8 | use URI; |
---|
9 | use Data::Validate::URI qw(is_uri); |
---|
10 | use Encode; |
---|
11 | |
---|
# Print usage and quit when no URL was given.
if (@ARGV < 1) {
    print "$0: [-f] url_to_pukiwikiplus_page\n";
    exit;
}

# '-f' asks us to save the page source to "<HEXNAME>.txt" instead of STDOUT.
# BUG FIX: the old code stored the literal string '-f' in $file, so when the
# URL already contained cmd=source the output was written to a file named '-f'.
my $save_to_file = 0;
if ($ARGV[0] eq '-f') {
    $save_to_file = 1;
    shift @ARGV;
}

my $url = $ARGV[0];
die "$0: '$url' is not URI" unless is_uri($url);

my $file;
if ($url !~ /cmd=source/) {
    # Rewrite "...?PageName" into PukiWiki's source view
    # "...?cmd=source&page=PageName".
    # BUG FIX: only read $1 when the substitution actually matched; the old
    # code used $1 unconditionally, picking up a stale capture when the URL
    # had no query part.
    if ($url =~ s#\?(.+)#\?cmd=source&page=$1#) {
        $file = wikifile($1) . '.txt' if $save_to_file;
    }
}
elsif ($save_to_file and $url =~ /page=([^&;]+)/) {
    # URL already points at the source view; recover the page name from the
    # query string so -f still produces a sensible file name.
    $file = wikifile($1) . '.txt';
}

print STDERR "getting: '$url'\n";

# Scrape the wiki source text out of PukiWiki's <pre id="source"> element.
my $source = scraper {
    process "pre#source", "source" => "TEXT";
    result "source";
};

my $text = $source->scrape( URI->new($url) );

# Fail loudly instead of printing an undef warning and empty output when the
# page has no <pre id="source"> (e.g. wrong URL or non-PukiWiki page).
die "$0: no page source found at '$url'\n" unless defined $text;

if ($save_to_file and defined $file) {
    print STDERR "saving: '$file'\n";
    # BUG FIX: three-arg, checked open (the old two-arg open was unchecked
    # and vulnerable to mode injection through the file name).
    open STDOUT, '>', $file or die "$0: cannot open '$file': $!";
}
print encode('utf8', $text);
---|
46 | |
---|
47 | |
---|
# Map a percent-encoded PukiWiki page name to the upper-case hex string
# PukiWiki uses as the on-disk file name (without the ".txt" suffix).
sub wikifile {
    my ($pagename) = @_;

    # Undo URL percent-encoding: every %XX sequence becomes its raw byte.
    $pagename =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/eg;

    # PukiWiki stores each page under the upper-case hex dump of its raw name.
    return uc unpack('H*', $pagename);
}
---|
Note: See
TracBrowser
for help on using the repository browser.