1 - extract obj files
  ../ieee754/extract < ../Buyuk.pdf
2 - get pages
  cut -d '[' -f 2 < 00003.obj | cut -d ']' -f 1 | sed -e 's/ 0 R//g' -e 's/ //' | tr -s ' ' '\n' > PAGES
3 - get data file of page
  for page in $(cat PAGES); do sed -e 's/.*Contents //' "$(printf '%5.5d' "$page").obj" | cut -d ' ' -f 1; done > PAGES_DATA
4 - get the number of the object holding the length of the compressed data (an object reference, not an index)
  for obj in $(cat PAGES_DATA); do head -n 1 "$(printf '%5.5d' "$obj").obj" | sed -e 's/.*Length //' | cut -d ' ' -f 1; done > PAGES_LENGTH_INDEX
5 - get length of compressed data
  for obj in $(cat PAGES_LENGTH_INDEX); do tr -s '\015' ' ' < "$(printf '%5.5d' "$obj").obj" | cut -d ' ' -f 4; done > PAGES_LENGTH
6 - build unzip script
  cat > unz.c << EOF
#include <stdio.h>
#include <stdlib.h>

/*
 * Generate the shell script that unpacks the page content streams.
 *
 * Usage: unz DATA_FILE LENGTH_FILE
 *
 * v[1] names a file of object numbers and v[2] a file of the matching
 * lengths, both read as whitespace-separated decimal integers. For every
 * pair, one command line is printed on stdout: it runs ../ieee754/deflate
 * with the length as its argument on the zero-padded five-digit NNNNN.obj
 * file, pipes the result through tr -s (carriage return, octal 015, to
 * newline) and stores it in NNNNN.txt.
 *
 * Returns 0 on success and EXIT_FAILURE when arguments are missing; aborts
 * when a file cannot be opened or the length file runs out before the
 * object file does.
 *
 * Note: the escapes in the printf format are written for the unquoted
 * here-document this program is embedded in; the shell collapses each
 * doubled backslash before the compiler sees the text. Keep them as is.
 */
int main(int n, char **v)
{
  FILE *objects, *lengths;
  int obj, len;

  /* Both input files are mandatory: fail cleanly instead of handing a
     missing argument to fopen. */
  if (n < 3) {
    fprintf(stderr, "usage: %s DATA_FILE LENGTH_FILE\n", n > 0 ? v[0] : "unz");
    return EXIT_FAILURE;
  }

  objects = fopen(v[1], "r");
  if (!objects) { perror(v[1]); abort(); }
  lengths = fopen(v[2], "r");
  if (!lengths) { perror(v[2]); abort(); }

  /* The object list drives the loop. An object without a matching length
     means the two files are out of step, so that stays a hard failure. */
  while (fscanf(objects, "%d", &obj) == 1) {
    if (fscanf(lengths, "%d", &len) != 1) {
      fprintf(stderr, "%s: no length for object %d\n", v[2], obj);
      abort();
    }
    printf("../ieee754/deflate %d < %5.5d.obj | tr -s '\\\015' '\\\n' > %5.5d.txt\n", len, obj, obj);
  }

  fclose(lengths);
  fclose(objects);
  return 0;
}
EOF
  gcc -o unz unz.c -Wall
  ./unz PAGES_DATA PAGES_LENGTH > unzip.sh
  chmod +x unzip.sh
7 - unzip
  ./unzip.sh
8 - get text data
  for obj in $(cat PAGES_DATA); do grep -e 'Tf$' -e 'Tm$' -e 'Tj$' -e 'TJ$' "$(printf '%5.5d' "$obj").txt"; echo XXX; done > FULL_TXT
9 - reconstruct text
  cd ..
  gcc retext.c -o retext -Wall -O3
  cd z
  ../retext < FULL_TXT > full.txt
10 - extract words
  sed -n -e 's/^WORD //p' full.txt | grep -v ' ' > dict.txt
11 - cleanup
remove non-words:
-by hand:
  mü
-automatically with online bts
----
font F3 (00008.obj):
from 12136.obj:
6 beginbfchar
<00d6> <0131>
<00f7> <011e>
<00f8> <011f>
<00f9> <0130>
<00fa> <015e>
<00fb> <015f>
endbfchar
raw font is in 12139.obj
to extract it:
./ieee754/deflate 12304 < z/12139.obj > z/F3.font
and edit file to remove first line and stuff at the end
then to explore it, use ttf.c
gcc -g -Wall ttf.c -o ttf -I /usr/include/freetype2 -lfreetype
we find that:
<00d6> -> ı (advance 469 width 453)
<00f7> -> Ğ (advance 1579 width 1458)
<00f8> -> ğ (advance 917 width 931)
<00f9> -> İ (advance 725 width 621)
<00fa> -> Ş (advance 981 width 818)
<00fb> -> ş (advance 747 width 546)

to use the advance from the font, convert it with val * 1000 / 2048
(2048 is the font's units_per_EM, see the gdb dump below), rounded to the
nearest integer, to go into text space (in reality text space * 1000)
so:
d6 -> 469 -> 229
f7 -> 1579 -> 771
f8 -> 917 -> 448
f9 -> 725 -> 354
fa -> 981 -> 479
fb -> 747 -> 365
(gdb) p *ft_face
$2 = {num_faces = 1, face_index = 0, face_flags = 2585, style_flags = 0, 
  num_glyphs = 663, family_name = 0x61c240 "Garamond", 
  style_name = 0x61c280 "Regular", num_fixed_sizes = 0, available_sizes = 0x0, 
  num_charmaps = 0, charmaps = 0x0, generic = {data = 0x0, finalizer = 0x0}, 
  bbox = {xMin = -284, yMin = -628, xMax = 2177, yMax = 2020}, 
  units_per_EM = 2048, ascender = 1765, descender = -539, height = 2304, 
  max_advance_width = 2275, max_advance_height = 2304, underline_position = 0, 
  underline_thickness = 0, glyph = 0x61c730, size = 0x61c980, charmap = 0x0, 
  driver = 0x6191e0, memory = 0x615010, stream = 0x61a9a0, sizes_list = {
    head = 0x61cba0, tail = 0x61cba0}, autohint = {data = 0x0, 
    finalizer = 0x0}, extensions = 0x0, internal = 0x61af70}