source:
lab.git/Dev/pukiwikiplus/getsource.pl
@
b804ca6
Last change on this file since b804ca6 was e3948e3, checked in by mitty <mitty@…>, 14 years ago | |
---|---|
|
|
File size: 1.1 KB |
Rev | Line | |
---|---|---|
[74f778c] | 1 | #! /usr/bin/perl |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | use utf8; | |
6 | ||
7 | use Web::Scraper; | |
8 | use URI; | |
9 | use Data::Validate::URI qw(is_uri); | |
10 | use Encode; | |
11 | ||
[e3948e3] | 12 | ## get wiki formatted source text from PukiWiki Plus! page |
13 | # '-f' option to save file automatically | |
14 | ||
# Command-line driver: fetch the wiki-formatted source of one PukiWiki Plus!
# page and write it (UTF-8 encoded) to STDOUT or, with '-f', to a file whose
# name is derived from the page name via wikifile().
#
# usage: getsource.pl [-f] url_to_pukiwikiplus_page

my $usage = "$0: [-f] url_to_pukiwikiplus_page\n";

# '-f' is a pure flag; keep it separate from the output file name so the
# literal string '-f' can never end up being used as a file name.
my $save_to_file = 0;
if (@ARGV and $ARGV[0] eq '-f') {
    shift @ARGV;
    $save_to_file = 1;
}

# Checked after option parsing so that a lone '-f' also gets the usage text
# instead of an uninitialized-value warning.
if (@ARGV < 1) {
    print $usage;
    exit;
}

my $url = $ARGV[0];
die "$0: '$url' is not URI" unless is_uri($url);

# Work out the page name and make sure $url addresses the source plugin.
my $pagename;
if ($url =~ /cmd=source/) {
    # already a source plugin URL: take the page name from its 'page' parameter
    ($pagename) = $url =~ /[?&;]page=([^&;#]+)/;
}
elsif ($url =~ s{\?(.+)}{?cmd=source&page=$1}) {
    # set url to page of source plugin page (source.inc.php);
    # $1 is only read here, where the substitution is known to have matched
    $pagename = $1;
}

my $file;
if ($save_to_file) {
    die "$0: cannot determine page name from '$url'\n"
        unless defined $pagename;
    $file = wikifile($pagename) . '.txt';
}

print STDERR "getting: '$url'\n";

my $source = scraper {
    # scrape page with '<pre id="source">' tag
    process "pre#source", "source" => "TEXT";
    result "source";
};

my $text = $source->scrape( URI->new($url) );

# Fail before touching the output file if the page had nothing to scrape.
die "$0: no '<pre id=\"source\">' element found at '$url'\n"
    unless defined $text;

if (defined $file) {
    print STDERR "saving: '$file'\n";
    # three-argument open: the file name is never parsed for a mode prefix
    open STDOUT, '>', $file
        or die "$0: cannot open '$file' for writing: $!\n";
}
print encode('utf8', $text);
[a1439aa] | 51 | |
52 | ||
sub wikifile {
    # Build the base name of a wiki text file from a page name.
    #   argument: page name, possibly containing %XX URL escapes
    #   returns : upper-case hex dump of the unescaped page name
    my ($escaped_name) = @_;

    # turn every %XX escape into the single byte it stands for
    (my $raw_name = $escaped_name) =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;

    my $hex_name = unpack('H*', $raw_name);
    return uc($hex_name);
}
Note: See TracBrowser
for help on using the repository browser.