* crawler for Nikkei editorial articles

[lab.git] / misc / nikkei.pl
diff --git a/misc/nikkei.pl b/misc/nikkei.pl

new file mode 100644 (file)

index 0000000..1df153d
--- /dev/null
+++ b/misc/nikkei.pl
@@ -0,0 +1,71 @@
+#!/usr/bin/env perl
+
+# Crawler for Nikkei editorial articles.
+#
+# Usage: nikkei.pl [DIRECTORY]
+#
+# Fetches the editorial index page and, for every article link found there,
+# downloads the article with wget into DIRECTORY (default "./") as
+# <article-id>.html, then strips <script> elements from the saved copy.
+# Articles whose file already exists are skipped.
+
+use strict;
+use warnings;
+use utf8;
+
+use LWP::Simple;
+binmode STDOUT => 'encoding(utf8)';
+
+my $directory  = shift @ARGV || "./";
+my $nikkei_url = 'http://www.nikkei.com/news/editorial/';
+my $nikkei     = 'http://www.nikkei.com';
+
+# $1 is the path prefix of an article link, $2 the article id.
+my $regex = qr{href="([^"]+)(DGXDZO\w+000)/};
+
+# /s lets "." cross newlines so multi-line script elements are removed too;
+# \b and /i also accept an attribute-less or upper-case <SCRIPT> tag.
+my $javascript = qr{<script\b.*?</script>}is;
+
+my $content = get($nikkei_url);
+defined $content
+    or die "Cannot fetch $nikkei_url\n";
+
+while ($content =~ /$regex/g) {
+    my ($path, $id) = ($1, $2);
+    my $article = "$nikkei$path$id/";
+    my $file    = "$directory/$id.html";
+
+    next if -f $file;
+
+    my $status = system("wget", "-q", $article, '-O', $file);
+    sleep 1;    # pause between downloads
+
+    if ($status != 0) {
+        # wget -O creates the output file before the transfer starts, so a
+        # failed download leaves a stub behind; remove it so that the next
+        # run retries instead of skipping the article.
+        warn "wget failed for $article\n";
+        unlink $file;
+        next;
+    }
+
+    my $in;
+    unless (open $in, '<', $file) {
+        warn "Cannot read $file: $!\n";
+        next;
+    }
+    my $body = do { local $/; <$in> };
+    close $in;
+
+    $body =~ s/$javascript//g;
+
+    my $out;
+    unless (open $out, '>', $file) {
+        warn "Cannot write $file: $!\n";
+        next;
+    }
+    print {$out} $body;
+    close $out
+        or warn "Cannot finish writing $file: $!\n";
+}