#!/usr/bin/perl

#
# read_sfr.pl: PDF中の表をcsvにするためのツール
#                                     by T-semi
# このスクリプトについての著作権を主張する気はありません。
# 好き勝手に使ってください。
# 正しく動く保証をする気もありません。
# どのように使っても構いませんが、
# どんな不利益があっても私は責任をとりません。
#

use warnings;
use strict;
use utf8;
use XML::Simple;
use Data::Dumper;

binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

if(@ARGV != 3){
    print STDERR "コマンドライン引数(順に): ";
    print STDERR "ファイル名 開始ページ 終了ページ\n";
    exit(1);
}

my $file = shift @ARGV;
my $from = shift @ARGV;
my $to = shift @ARGV;

sub ifcol{
    my $min1 = shift;
    my $max1 = shift;

    my $min2 = shift;
    my $max2 = shift;

    if($min2 <= $min1 && $min1 <= $max2 ||
       $min2 <= $max1 && $max1 <= $max2 ||
       $min1 <= $min2 && $min2 <= $max1){
        return 1;
    }else{
        return 0;
    }
}

sub mcol{
    my $min1 = shift;
    my $max1 = shift;

    my $min2 = shift;
    my $max2 = shift;

    if($min2 < $min1){
        $min1 = $min2;
    }
    if($max2 > $max1){
        $max1 = $max2;
    }
    return ($min1, $max1);
}

sub treep{
    my @ret = (shift);
    my @loc = @_;

    for my $np(@loc){
        my @pret = @ret;
        @ret = ();
        for(@pret){
            my $c = $_->{$np};
            if(ref($c) eq "ARRAY"){
                push(@ret, @$c);
            }else{
                push(@ret, $c);
            }
        }
    }

    return @ret;
}

sub treear{
    my $in = shift;

    my @ta = ([keys(%$in)]);    #解析中位置にたどり着くまでのkeyやindex
    $in->{name} = "_root";
    my @tk = ("_root");         #解析中位置までのハッシュkey
    my @tt = ($in);             #解析中位置までたどり着くまでのref
    while(1){
        my $k = pop(@{$ta[$#ta]});
        my $nex;
        if(ref($tt[$#tt]) eq "ARRAY"){
            $nex = $tt[$#tt][$k];
        }else{
            $nex = $tt[$#tt]{$k};
        }
        if(ref($nex) eq "ARRAY"){
            push(@ta, [0 .. $#$nex]);
        }elsif(ref($nex) eq "HASH"){
            push(@ta, [keys(%$nex)]);
            if(ref($tt[$#tt]) eq "ARRAY"){
                $nex->{"_index"} = $k;
                $nex->{"_name"} = $tk[$#tk];
                $nex->{"_parent"} = $tk[$#tk-1];
            }else{
                $nex->{"_name"} = $k;
                $nex->{"_parent"} = $tk[$#tk];
            }
            $nex->{"_path"} = join("/", @tk, $k);
            if($#tt >= 0){
                $nex->{"_pp"} = $tt[$#tt];
                if(ref($nex->{"_pp"}) ne "HASH"){
                    if($#tt > 0){
                        $nex->{"_ph"} = $tt[$#tt-1];
                    }
                }else{
                    $nex->{"_ph"} = $tt[$#tt];
                }
            }
        }
        if(ref($nex)){
            push(@tt, $nex);
            push(@tk, $k);
        }
        while(!@{$ta[$#ta]}){
            pop(@ta);
            if(!@ta){
                return;
            }
            pop(@tt);
            pop(@tk);
        }
    }
}

sub merge_ranges{
    my $i;

    my @ret = sort{$a->[0] <=> $b->[0]}(@_);
    for($i = 0; $i < $#ret; $i++){
        if(ifcol(@{$ret[$i]}, @{$ret[$i+1]})){
            @{$ret[$i]} = mcol(@{$ret[$i]}, @{$ret[$i+1]});
            splice(@ret, $i+1, 1);
            if($i < $#ret){
                redo;
            }
        }
    }
    return @ret;
}

sub rangematch{
    my $axis = shift;
    my $range = shift;
    my @ret;
    for(@_){
        if(ifcol(@$range, $_->{"${axis}Min"}, $_->{"${axis}Max"})){
            push(@ret, $_);
        }
    }
    return @ret;
}

sub textmerge{
    my $ret = "";
    for(
        sort{
            abs($a->{yMax}+$a->{yMin}-$b->{yMax}-$b->{yMin})/2
            <($a->{yMax}-$a->{yMin}+$b->{yMax}-$b->{yMin})/4*0.7?
                $a->{xMin}<=>$b->{xMin} : $a->{yMin}<=>$b->{yMin}
        }(@_)
    ){
        $ret .= $_->{content};
    }
    return $ret;
}

my $i;
my $j;

my @pages;

for($from .. $to){
    my $i;
    my $mst;
    my @ecmd = ("pdftotext", "-bbox-layout", "-f", $_, "-l", $_, $file, "-");
    print STDERR join(" ", @ecmd)."\n";

    open($mst, "-|:utf8", @ecmd);
    my $ll;
    while(<$mst>){
        $ll .= $_;
    }
    close($mst);

    my $indat = XMLin($ll);

    #treear($indat);

    my @tliners;
    my @words = treep($indat, qw(body doc page flow block line word));

    for(@words){
        my $i;
        push(@tliners, [$_->{"yMin"}, $_->{"yMax"}]);
    }
    @tliners = merge_ranges(@tliners);

    my @tlinetexts;
    my @tlineref;
    for(@words){
        my $i;
        my @yr = ($_->{"yMin"}, $_->{"yMax"});
        for($i = 0; $i < @tliners; $i++){
            if(ifcol(@{$tliners[$i]}, @yr)){
                if(!defined($tlinetexts[$i])){
                    $tlinetexts[$i] = [];
                }
                push(@{$tlinetexts[$i]}, $_->{content});
                push(@{$tlineref[$i]}, $_);
                last;
            }
        }
    }

    push(@pages, {text=>[@tlinetexts], pos=>[@tliners], refs=>[@tlineref]});
}

my @rowmgs;
my @colsplit;
my @colcenter;
my @coltitles;

for($i = 0; $i < 2 && $i < @pages; $i++){
    my $j;
    for($j = $#{$pages[$i]{text}}; $j >= 0; $j--){
        print STDERR "$j: ";
        print STDERR join("/", @{$pages[$i]{text}[$j]});
        print STDERR "\n";
    }
    print STDERR "表の列の名前の行の番号を入れて: ";
    my $startl = <STDIN>;

    if(!@colsplit){
        @coltitles = @{$pages[$i]{text}[$startl]};
        for(@{$pages[$i]{refs}[$startl]}){
            push(@colsplit, [$_->{"xMin"}, $_->{"xMax"}]);
        }
        @colsplit = merge_ranges(@colsplit);
    }

    for($j = 0; $j < @{$pages[$i]{text}}; $j++){
        print STDERR "$j: ";
        print STDERR join("/", @{$pages[$i]{text}[$j]});
        print STDERR "\n";
    }
    print STDERR "表の一番したの行の番号を入れて: ";
    my $endl = <STDIN>;

    for($j = $startl+1; $j < $endl; $j++){
        push(@rowmgs, $pages[$i]{pos}[$j+1][0] - $pages[$i]{pos}[$j][1]);
    }
}

@rowmgs = sort{$a<=>$b}(@rowmgs);
my $typical_rowmg = $rowmgs[$#rowmgs/2];

my $rowmg_min = $typical_rowmg * 0.5;
my $rowmg_max = $typical_rowmg * 1.5;

my @telems;

for($i = 0; $i < @pages; $i++){
    my $lastbottom;
    my $j;
    push(@telems, []);
    for($j = 0; $j < @{$pages[$i]{text}}; $j++){
        my $mc = 0;
        for my $n(@coltitles){
            if(grep{$n eq $_}(@{$pages[$i]{text}[$j]})){
                $mc++;
            }
        }
        if($mc >= @coltitles * 0.7){
            $lastbottom = $pages[$i]{pos}[$j][1];
            last;
        }
    }
    $j++;
    if($j >= @{$pages[$i]{pos}}){
        print STDERR "page $i skipped.\n";
    }
    for(; $j < @{$pages[$i]{pos}}; $j++){
        if($lastbottom + $rowmg_max < $pages[$i]{pos}[$j][0]){
            last;
        }else{
            for(@{$pages[$i]{refs}[$j]}){
                push(@colsplit, [$_->{"xMin"}, $_->{"xMax"}]);
            }
            push(@{$telems[$#telems]}, @{$pages[$i]{refs}[$j]});
        }
        $lastbottom = $pages[$i]{pos}[$j][1];
    }
}
@colsplit = merge_ranges(@colsplit);

for($i = 0; $i < @{$pages[0]{text}}; $i++){
    my $mc = 0;
    for my $n(@coltitles){
        if(grep{$n eq $_}(@{$pages[0]{text}[$i]})){
            $mc++;
        }
    }
    if($mc >= @coltitles * 0.7){
        last;
    }
}

if($i > $#{$pages[0]{text}}){
    die;
}

@coltitles = ();

for($j = 0; $j < @colsplit; $j++){
    my $title = textmerge(
                    rangematch("x", $colsplit[$j], @{$pages[0]{refs}[$i]}));
    push(@coltitles, $title);
    print STDERR "$j: $title\n";
}
print STDERR "どの列を行の判定基準とする? ";
my $basecol = <STDIN>;

my @tdata;
for my $pelems(@telems){
    my $i;
    my @row_yranges;
    for my $be(rangematch("x", $colsplit[$basecol], @$pelems)){
        push(@row_yranges, [$be->{yMin}, $be->{yMax}]);
    }
    @row_yranges = merge_ranges(@row_yranges);

    for($i = 0; $i < $#row_yranges; $i++){
        $row_yranges[$i][1] = $row_yranges[$i+1][0] - $rowmg_min;
    }
    $row_yranges[$#row_yranges][1] = ~0;

    for(@row_yranges){
        my @row_elems = rangematch("y", $_, @$pelems);
        my @row;
        for(@colsplit){
            push(@row, textmerge(rangematch("x", $_, @row_elems)));
        }
        push(@tdata, [@row]);
    }
}

print join(",", @coltitles)."\n";
for(@tdata){
    print join(",", @$_)."\n";
}

