1 - extract obj files
../ieee754/extract < ../Buyuk.pdf

2 - get pages
cat 00003.obj | cut -f 2 -d '['|cut -f 1 -d ']'|sed -e "s/ 0 R//g" |sed -e "s/ //"|tr -s ' ' '\n' > PAGES

3 - get data file of page
for i in `cat PAGES`; do sed -e "s/.*Contents //" `printf "%5.5d" $i`.obj |cut -f 1 -d ' '; done > PAGES_DATA

4 - get length index (in fact object, not index) of compressed data
for i in `cat PAGES_DATA`; do head -n 1 `printf "%5.5d" $i`.obj | sed -e "s/.*Length //"| cut -f 1 -d ' '; done > PAGES_LENGTH_INDEX

5 - get length of compressed data
for i in `cat PAGES_LENGTH_INDEX`; do cat `printf "%5.5d" $i`.obj | tr -s '\015' ' '|cut -f 4 -d ' '; done > PAGES_LENGTH

6 - build unzip script
cat > unz.c << EOF
#include <stdio.h>
#include <stdlib.h>
int main(int n, char **v)
{
  FILE *f1, *f2;
  int data, size;
  f1 = fopen(v[1], "r"); if (!f1) abort();
  f2 = fopen(v[2], "r"); if (!f2) abort();
  while (1) {
    if (fscanf(f1, "%d", &data) != 1) break;
    if (fscanf(f2, "%d", &size) != 1) abort();
    printf("../ieee754/deflate %d < %5.5d.obj | tr -s '\\\015' '\\\n' > %5.5d.txt\n", size, data, data);
  }
  return 0;
}
EOF
gcc -o unz unz.c -Wall
./unz PAGES_DATA PAGES_LENGTH > unzip.sh
chmod +x unzip.sh

7 - unzip
./unzip.sh

8 - get text data
for i in `cat PAGES_DATA`; do grep -e "Tf$" -e "Tm$" -e "Tj$" -e "TJ$" `printf "%5.5d" $i`.txt; echo XXX; done > FULL_TXT

9 - reconstruct text
cd ..
gcc retext.c -o retext -Wall -O3
cd z
../retext < FULL_TXT > full.txt

10 - extract words
grep "^WORD " full.txt | sed -e "s/^WORD //"|grep -v ' ' > dict.txt

11 - cleanup
remove non-words:
-by hand: mü
-automatically with online bts

----

font F3 (00008.obj):
from 12136.obj:
6 beginbfchar
<00d6> <0131>
<00f7> <011e>
<00f8> <011f>
<00f9> <0130>
<00fa> <015e>
<00fb> <015f>
endbfchar

raw font is in 12139.obj
to extract it:
./ieee754/deflate 12304 < z/12139.obj > z/F3.font
and edit file to remove first line and stuff at the end

then to explore it, use ttf.c
gcc -g -Wall ttf.c -o ttf -I /usr/include/freetype2 -lfreetype

we find that:
<00d6> -> ı (advance 469 width 453)
<00f7> -> Ğ (advance 1579 width 1458)
<00f8> -> ğ (advance 917 width 931)
<00f9> -> İ (advance 725 width 621)
<00fa> -> Ş (advance 981 width 818)
<00fb> -> ş (advance 747 width 546)

to use advance from the font, you must do: val * 1000 / 2048 to go into text space (in reality text space * 1000)
(2048 is the font's units_per_EM, see the gdb dump below; the results are rounded to the nearest integer)
so:
d6 -> 469 -> 229
f7 -> 1579 -> 771
f8 -> 917 -> 448
f9 -> 725 -> 354
fa -> 981 -> 479
fb -> 747 -> 365

(gdb) p *ft_face
$2 = {num_faces = 1, face_index = 0, face_flags = 2585, style_flags = 0,
  num_glyphs = 663, family_name = 0x61c240 "Garamond",
  style_name = 0x61c280 "Regular", num_fixed_sizes = 0,
  available_sizes = 0x0, num_charmaps = 0, charmaps = 0x0,
  generic = {data = 0x0, finalizer = 0x0},
  bbox = {xMin = -284, yMin = -628, xMax = 2177, yMax = 2020},
  units_per_EM = 2048, ascender = 1765, descender = -539, height = 2304,
  max_advance_width = 2275, max_advance_height = 2304,
  underline_position = 0, underline_thickness = 0, glyph = 0x61c730,
  size = 0x61c980, charmap = 0x0, driver = 0x6191e0, memory = 0x615010,
  stream = 0x61a9a0, sizes_list = {
    head = 0x61cba0, tail = 0x61cba0}, autohint = {data = 0x0, finalizer = 0x0},
  extensions = 0x0, internal = 0x61af70}