#!/bin/bash
#
# Copyright (C) 2003 by Juan F. Codagnone <juam /@/ arnet com ar>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# Release number printed by the version function.
VERSION='1.0.0'
# Page-range arguments for the converter; both start out empty and are
# filled in by parseOptions when -f / -l are given.
first=
last=
# Converter executable run by main; the path is relative, so it is
# resolved against the current working directory.
pdftohtml='../../pdftohtml'
# Print the command synopsis ($0 is the name the script was invoked as),
# a blank line, and the contact address to stdout.
function usage()
{
	printf '%s\n' \
		"Usage: $0 [-vh] [-f first_page] [-l last_page] guia.pdf" \
		"" \
		"send suggestions to juam  arnet . com . ar"
}

# Print "Version $VERSION", a blank line, and the copyright / no-warranty
# notice to stdout.
function version()
{
	printf 'Version %s\n' "$VERSION"
	printf '%s\n' \
		'' \
		'Copyright (C) 2003 Juan F. Codagnone' \
		'This is free software; see the source for copying conditions.  There is NO' \
		'warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.'
}

# Parse the command line and fill in the globals used by main.
#
# Arguments: the script's command-line words.
# Globals written:
#   first - "-f N" when -f N is given
#   last  - "-l N" when -l N is given
#   guia  - first non-option argument, unless guia is already non-empty
# Exits:
#   with the status of usage/version for -h / -v;
#   with 1 (message on stderr) for an invalid option, a missing file
#   argument, or a file that does not exist.
function parseOptions()
{	local opt
	# getopts state is kept local so a parse started elsewhere is not disturbed
	local OPTIND
	local OPTARG

	while getopts "hvf:l:" opt
	do
		case $opt in
		  h)	usage
			exit $? ;;
		  v)	version
			exit $? ;;
		  f)	first="-f ${OPTARG}" ;;
		  l)	last="-l ${OPTARG}" ;;
		  *)	# getopts already reported the bad option on stderr
			usage >&2
			exit 1 ;;
		esac
	done
	# Drop the parsed options; $1 is now the first operand.  This replaces
	# an eval-based lookup that re-split the file name and misread
	# positions 10 and above.
	shift $((OPTIND - 1))

	if [[ -z "$guia" ]]; then
		guia=${1-}
		if [[ -z "$guia" ]]; then
			usage >&2
			exit 1
		fi
	fi
	if [[ ! -e "$guia" ]]; then
		echo "file '$guia' does not exists. aborting" >&2
		exit 1
	fi
}

# Convert the PDF named by $guia to HTML and extract directory records.
#
# Globals read: pdftohtml, first, last, guia.
# Files written in the current directory:
#   errors       - one "ERROR page|surname|name|street|streetnum|tel" line
#                  per record missing surname, street, streetnum or tel
#   <character>  - one file per first character of the surname, holding
#                  "page|surname|name|street|streetnum|tel" lines
# Stdout: one "Page N: X records(total T)" line per page.
function main()
{
	# $first and $last are left unquoted on purpose: each is either empty
	# or an option plus its value ("-f N") that must split into two words.
	# $guia is quoted so a file name containing whitespace stays one word.
	"$pdftohtml" $first $last  -stdout "$guia" |
	sed -e  's/»/ @  /g' \
	    -e  's/[.]//g'  \
	    -e  "s/'/\'/g" \
	    -e  's/<br>//g' \
	    -e  's/<A name=/<A name= /g' \
	    -e  's/><\/a>/ ><\/a> \
/g' |
	# put a space after every ">" and before every "<" so that tags
	# become separate awk fields
	sed  -e 's/>/> /g' -e 's/</ </g'  |
	# remove html headers
	awk '
# Print only the lines that follow the line whose first field is "<BODY".
{
	if( p  )
		print 
	if( $1=="<BODY" )
		p=1;

}' | 
	# showtime
	awk '

# True when str is an opening or closing bold tag: <b>, </b>, <B>, </B>.
function isBold(str)
{
	return str ~ "^</?[bB]>$"
}


# True when str is digits, a hyphen, digits (either digit run may be empty).
function istelnum(str)
{
	return str ~ "^[[:digit:]]*-[[:digit:]]*$"
}

# True when str is not a telephone number and is unchanged by toupper().
function isupper(str)
{
	return  !istelnum(str) && str==toupper(str)
}

# Emit the record held in the globals surname/name/street/streetnum/tel,
# then clear them and count the record in nr.
# A record missing surname, street, streetnum or tel goes to "errors";
# any other record is appended to the file named after the first
# character of the surname.
function printreg()
{
	if( surname=="" || street=="" || streetnum=="" || tel=="" )
	{	print "ERROR " page "|" surname "|" name "|" street "|" \
		      streetnum "|" tel >> "errors"
		fflush("errors")
	}
	else
	{	# POSIX numbers string positions from 1; a start of 0 is not
		# portable across awk implementations
		l = substr(surname,1,1)
		# opening with ">" creates the file empty the first time
		# this name is used
		if( file=="" ||  l != file  )
			printf("") > l
		file=l
		print page "|" surname "|" name "|" street "|" streetnum "|" \
		      tel >> file
	}
	lastsurname=surname
	surname=""
	name=""
	tel=""
	street=""
	streetnum=""
	nr++;
}

# ignore html trailing headers
$0 ~ "^[[:space:]]*</(HTML|BODY)>[[:space:]]*$" { next ; }


BEGIN {
	# start with a fresh errors file (it holds a single empty line)
	print "" > "errors"
	nr=0
}
END {
	# Flush a record that is still pending.  When every field is already
	# empty there is nothing to flush, and calling printreg() would log
	# a blank ERROR line and inflate the record count.
	if( surname!="" || name!="" || street!="" || streetnum!="" || tel!="" )
		printreg()
	print "Page " page ": " nr - lastnr   " records(total " nr ")"   
}
# detect a new page: the third field of the anchor line is the page number
$0 ~ "^[[:space:]]*[<]A name=" { 
	page=$3 
	# report the page that just ended (nothing to report before page 1)
	if( page-1 != 0 )
		print "Page " page-1 ": " nr - lastnr   " records(total " nr ")"   
	lastnr=nr
	next 
}

# ignore cover 
page == 1 { next } 

# else: classify the line and update the record being built
{	
	# bold "letters-letters" line: consume it and the next two lines
	if ( $0~"^ <b> [[:alpha:]]+-[[:alpha:]]+[[:space:]]*</b>[[:space:]]*$" )
	{	getline;
		getline;
	}
	# bold telephone number: completes the record
	else if( isBold($1) && istelnum($2) && isBold($3) )
	{	tel=$2
		state = 0;
		printreg()
	}
	# bold line starting with "@": name only, surname carried over from
	# the previous record when none was read
	else if( isBold($1) && $2=="@" )
	{	if( surname == "" )
			surname=lastsurname
		for(i=3; i<NF ; i++ )	
		{	if( name=="" )
				name=$i
			else
				name=sprintf("%s %s",name,$i);
		}
		state=2
	}
	# non-bold line: every field but the last is appended to the street,
	# the last field becomes the street number
	else if ( !isBold($1) && !isBold($NF) )
	{
		for( i=1; i < NF ; i++ )
		{	if( street=="" )
				street=$i
			else
				street=sprintf("%s %s",street,$i);
		}
		streetnum=$NF
	}
	# state is 0 at start and after each telephone line; it is the only
	# state value ever tested, so a surname line is accepted only then
	else if( state==0 )
	{	if( isBold($1) && isBold($NF) && isupper($2) )
		{	lastsurname=surname
			name=""
			surname=""
			tel=""
			# leading upper-case fields form the surname
			for( i=2 ; i<NF && isupper($i) ; i++ )
			{	if( surname=="" )
					surname=$i
				else
					surname=sprintf("%s %s",surname, $i);
			}
			if( surname=="" )
				surname=lastsurname;
				
			# remaining fields before the closing tag form the name
			if( i != NF )
			{	for( ; i<NF ; i++ )
				{	if( name=="" )
						name=$i;
					else
						name=sprintf("%s %s",name,$i);
				}
				state=2
			}
			else
				state = 1;
		}
	}
}'
}

# Hand the command line over with "$@" so every argument keeps its
# original word boundaries (an unquoted $* would re-split on whitespace
# and expand glob characters).
parseOptions "$@"
main

