Line | |
---|
1 | #! /usr/bin/perl |
---|
2 | |
---|
3 | use strict; |
---|
4 | use warnings; |
---|
5 | use utf8; |
---|
6 | |
---|
7 | use Web::Scraper; |
---|
8 | use URI; |
---|
9 | use Data::Validate::URI qw(is_uri); |
---|
10 | use Encode; |
---|
11 | |
---|
## Get the wiki-formatted source text of a PukiWiki Plus! page.
#
# Usage: script [-f] url_to_pukiwikiplus_page
#
#   -f   save the text automatically to a file named after the page
#        (see wikifile() below) instead of printing it to STDOUT.
#
# If the URL does not already address the source plugin (cmd=source), its
# query string is rewritten to "?cmd=source&page=<original query>".

if (@ARGV < 1) {
    print "$0: [-f] url_to_pukiwikiplus_page\n";
    exit;
}

# Remember that -f was given as a flag; the output file name itself is
# derived from the page name further down, never from the option string.
my $save_to_file = 0;
if ($ARGV[0] eq '-f') {
    shift @ARGV;
    $save_to_file = 1;
}

my $url = $ARGV[0];
die "$0: [-f] url_to_pukiwikiplus_page\n" unless defined $url;
die "$0: '$url' is not URI" unless is_uri($url);

my $pagename;
if ($url =~ /cmd=source/) {
    # Already a source plugin URL: take the page name from its page= parameter
    # so that -f still yields a sensible file name.
    ($pagename) = $url =~ /[?&;]page=([^&;#]+)/;
}
elsif ($url =~ s#\?(.+)#?cmd=source&page=$1#) {
    # set url to page of source plugin page (source.inc.php);
    # $1 is only trusted here because the substitution just succeeded.
    $pagename = $1;
}

my $file;
if ($save_to_file) {
    die "$0: cannot determine page name from '$url'\n"
        unless defined $pagename && length $pagename;
    $file = wikifile($pagename) . '.txt';
}

print STDERR "getting: '$url'\n";

my $source = scraper {
    # scrape page with '<pre id="source">' tag
    process "pre#source", "source" => "TEXT";
    result "source";
};

my $text = $source->scrape( URI->new($url) );
die "$0: no <pre id=\"source\"> element found at '$url'\n"
    unless defined $text;

if (defined $file) {
    print STDERR "saving: '$file'\n";
    # Redirect STDOUT to the file; 3-arg open keeps the file name from being
    # interpreted as a mode, and failure is reported instead of ignored.
    open STDOUT, '>', $file
        or die "$0: cannot open '$file' for writing: $!\n";
}
print encode('utf8', $text);

# Buffered write errors only surface on close.
close STDOUT or die "$0: cannot write output: $!\n";
---|
51 | |
---|
52 | |
---|
# wikifile($pagename)
#
# Get the wiki text file base name from a page name.
#
# $pagename is the page name as it appears in the URL; "%XX" escapes in it
# are decoded to the bytes they stand for. The return value is the upper-case
# hexadecimal dump of the resulting bytes (e.g. "FrontPage" gives
# "46726F6E7450616765"). An undefined page name gives the empty string.
sub wikifile {
    my ($pagename) = @_;

    # Nothing to encode; avoid "uninitialized value" warnings below.
    return '' unless defined $pagename;

    # Decode each %XX escape into its raw byte.
    $pagename =~ s/%([0-9A-Fa-f]{2})/pack('H2', $1)/eg;

    # Two upper-case hex digits per byte of the decoded name.
    return uc(unpack('H*', $pagename));
}
---|
Note: See
TracBrowser
for help on using the repository browser.