#!/usr/bin/perl -w
use strict;

## digioPDF2text
##   with pdftotext 1.00 (included in Xpdf 1.00)
##   tested with RedHatLinux7.3 (perl-5.6.1, pdftotext1.00, wget1.8.1)
##
## Time-stamp: "Jan 25 2003"
## author: itouh
##
## For every channel in @channel, download this week's programme list
## (<channel>.pdf) with wget and convert it to <channel>-<YYYY-MMDD>.txt
## in the current directory, where the date is the Monday of the current
## broadcast week.  Channels whose text file for that Monday is already
## the newest one present are skipped.

# Channel numbers whose programme lists are retrieved.
my @channel = qw(403 446 473);

mainfunc();
exit();

## mainfunc()
##   Entry point: for each channel, fetch and convert the weekly PDF unless
##   the newest existing "<channel>-*.txt" already carries this week's date.
sub mainfunc {
    my $issueday = getMondayOfThisWeek();

    foreach my $ch (@channel) {
        my @day = sort glob("$ch-*.txt");
        if (@day) {
            # Date part of the lexically last (i.e. newest) text file.
            my ($d) = $day[-1] =~ /^\Q$ch\E\-([0-9\-]*)\.txt$/;

            # $d is undef when the file name has no date-shaped stem;
            # guard so that the comparison does not warn under -w.
            if (defined $d and $issueday eq $d) {
                ## already retrieved
                print " $ch-$d.txt is already there, will not retrieve.\n";
                next;
            }
        }

        my $pdffile = getPDF($ch);
        pdf2text($pdffile, "$ch-$issueday.txt");
    }
    return;
}

## getMondayOfThisWeek()
##   Returns the date of the Monday that starts the current broadcast week,
##   formatted as "YYYY-MMDD" in local time.
sub getMondayOfThisWeek {
    my $t = time();

    ## Between 0:00 and 3:59 am yesterday's programme is still on the air,
    ## so move the clock back 4 hours.
    if ((localtime($t))[2] < 4) {
        $t -= 4 * 60 * 60;
    }

    ## Step back 24 hours at a time until we reach a Monday (wday == 1).
    while ((localtime($t))[6] != 1) {
        $t -= 24 * 60 * 60;
    }

    ## Return Monday's date.
    my ($d, $m, $y) = (localtime($t))[3 .. 5];
    return sprintf("%04d-%02d%02d", $y + 1900, $m + 1, $d);
}

## getPDF($ch)
##   Downloads the programme list of channel $ch into "$ch.pdf" in the
##   current directory and returns that file name.  Dies if the stale copy
##   cannot be removed or if wget exits with a non-zero status.
sub getPDF {
    my ($ch) = @_;
    my $pdffile = "$ch.pdf";

    # Remove any previous copy, even an empty one: wget does not overwrite
    # an existing file but saves to "$ch.pdf.1" instead, which would make
    # us convert the stale file.
    if (-e $pdffile) {
        unlink($pdffile) or die "cannot remove $pdffile: $!";
    }

    # List form of system() runs wget directly, without a shell.
    system("wget", "http://www.stardigio.com/weekly/proglist/$ch.pdf") == 0
        or die "cannot get $ch.pdf!";

    return $pdffile;
}

## pdf2text($pdffile, $txtfile)
##   Converts $pdffile to EUC-JP text with pdftotext (via the scratch file
##   "tmp.txt"), then writes a tidied copy to $txtfile.  Dies if pdftotext
##   fails or if either file cannot be opened / written.
sub pdf2text {
    my ($pdffile, $txtfile) = @_;
    my $tmpfile = "tmp.txt";

    if (-e $tmpfile) {
        unlink($tmpfile);
    }

    system("pdftotext", "-enc", "EUC-JP", "-raw", $pdffile, $tmpfile) == 0
        or die "cannot read $pdffile!";

    open(my $in,  '<', $tmpfile) or die "cannot read $tmpfile!";
    open(my $out, '>', $txtfile) or die "cannot write $txtfile!";

    # NOTE(review): runs of spaces inside the patterns below may have been
    # collapsed when this file was transcribed -- confirm against real
    # pdftotext output.
    while (my $line = <$in>) {
        chomp($line);

        # Skip lines that start with a space.
        if ($line =~ /^ /) {
            next;
        }

        ## Trim some of the spaces so that performer names line up when
        ## viewed in a fixed-width font.
        my $buf = $line;
        $buf =~ s/ .*$//;                    # keep only the song title
        my $zen = $buf =~ tr/\x80-\xfe//;    # high-bit bytes in the title
                                             # (twice the number of
                                             # full-width characters)
        if ($zen > 0) {
            my $rmsp = " " x ($zen / 2);     # spaces to remove
            $line =~ s/$rmsp//;
        }

#       if(/^ / or /^ /){
#           ## When the line starts with a half-width/full-width space,
#           ## do not start a new line after the previous one.
#           print OUT "$_";
#       }else{
#           print OUT "\n$_";
#       }
        print $out "\n$line";
    }
    print $out "\n";

    # Buffered write errors only surface at close time.
    close($out) or die "cannot write $txtfile!";
    close($in);
    return;
}

## end of file