#!/bin/bash
# Karsten M. Self
# Thu Dec 13 17:11:09 PST 2007
#
# Converts a zipped Mac OS X iWork Pages document to plain text.
# Requires the archive's filename as its argument.
#
# Developed on Debian GNU/Linux, may not work on other 'Nixen given
# arguments, but should.

# Validate the command line before doing any work.
#   - Exactly what is required: at least one argument, and the first
#     argument must name an existing regular file.
#   - Diagnostics go to stderr so that they never mix with the converted
#     text, which this script writes to stdout.
#   - "$1" is quoted so filenames containing spaces or glob characters are
#     tested as a single word (unquoted, '[' errors out and the check is
#     silently skipped).
if [[ $# -lt 1 ]]; then
    echo "Please supply a filename" >&2
    exit 1
fi

if [[ ! -f "$1" ]]; then
    printf '%s: file does not exist\n' "$1" >&2
    exit 1
fi

# Print the archive member name that holds the document body for the
# archive path given as $1.
#   - The directory part of the path is dropped: member names inside the
#     archive are relative to the archive root, not to where the archive
#     happens to live on disk.
#   - A trailing ".zip" is replaced by "/index.xml.gz"; a name without that
#     suffix is printed unchanged (minus its directory part).
#   - Pure parameter expansion is used so whitespace, glob characters and
#     leading dashes in the name pass through untouched.
index_member_for() {
    local name=${1##*/}
    if [[ $name == *.zip ]]; then
        name=${name%.zip}/index.xml.gz
    fi
    printf '%s\n' "$name"
}

INDEXFILE=$(index_member_for "$1")

# File format:  it's a ZIP archive, payload is
# "<filename-less-ZIP-extension>/index.xml.gz". 
#
# Suck that out, nuke everything to the first "page-start" tag, insert
# linefeeds for paragraph tags, nuke all tags, and post-process through
# 'cat -s' to strip extraneous linefeeds.  fmt to 78 columns.

# Filter: reads the decompressed Pages index XML on stdin and writes plain
# text to stdout.
#   1. Put every tag at the start of its own line.
#   2. Drop everything up to and including the first "<sf:page-start" line.
#   3. Turn paragraph-opening tags into a line break, then delete all
#      remaining tags.
#   4. Squeeze runs of blank lines (cat -s) and reflow to 78 columns (fmt).
# Relies on sed accepting "\n" in the replacement text (GNU sed does).
pages_xml_to_text() {
    sed -e 's/</\n&/g' |
    sed -e '1,/^<sf:page-start/d' |
    sed -e '
	s/<sf:p sf:style="paragraph-style-[^>]*">/\n/g
	s/<[^>]*>//g
	' |
    cat -s |
    fmt -u -w 78
}

# Stream the gzip-compressed index member out of the archive, decompress
# it, and convert it to text.  Both the archive path and the member name
# are quoted so that names containing spaces reach unzip as single
# arguments.
unzip -p "$1" "$INDEXFILE" | zcat | pages_xml_to_text
