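/* DUMP -- stray line, not C code; presumably the HTML page title picked up when this file was extracted as text */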

/*
 * xpress2html : extract texts from a Quark Xpress file
 * and saves it in an html file, encoded with UTF-8.
 * (only tested with versions 3.3 and 4.0 for Mac; it
 * probably won't work with something else, just try...)
 *
 * xpress2html is brutal. It will scan all the
 * blocs of the XPress file and 'guess' if they
 * are blocs describing texts or not.
 * It may fail in the process, you may get
 * in the html file some data which is not real
 * text. But the probability is low, I think.
 * xpress2html should extract ALL the texts
 * from the file, as far as the format doesn't
 * change too much.
 *
 * Texts will be extracted in the order they
 * appear in the file, which probably will be
 * different from what you have in the XPress
 * output. Hey, I didn't create the file
 * format, I'm not responsible for this.
 *
 * All formatting information will be lost.
 * It's tricky to extract this. A font
 * named "myfont", how do you know if
 * the text will be italic, bold, and
 * the like ? xpress2html simply doesn't
 * handle this, unfortunately.
 *
 * xpress2html does only handle Mac files.
 * Who uses XPress with Windows by the way ?
 * The character set is supposed to be
 * Mac Roman.
 *
 * xpress2html only works under linux.
 * One may modify it to let it work under
 * other systems. Ask me for some help if
 * you really need it.
 *
 * latest version available at:
 * http://sed.free.fr/xpress2html.c
 *
 * Released in the public domain.
 * Freedom hates proprietary formats.
 * Freedom hates licenses.
 *
 * This program would simply not exist if Frans Faase
 * didn't hack those XPress files for something like
 * two years. Special greetings must fly to him.
 * See his website:
 * http://home.planet.nl/~faase009/QX.html
 *
 * Quark is going to lose the game. Keep your
 * file format secret, who cares. At some time
 * in the future, someone somewhere will write
 * from scratch an Xpress like program, based
 * on free technologies, available for everyone,
 * easy to use, friendly, all what one may dream
 * about. And it will be free software.
 *
 * In the meantime, people still use XPress, and
 * need to extract data from THEIR files, and
 * don't want to waste their money on accessing
 * THEIR own data. So, people with some hacking
 * skills will produce little tools like this one.
 *
 * Release info:
 * Fri, 30 May 2003 18:28:41 +0200 - starting some hack
 * Sun, 1 Jun 2003 22:31:34 +0200 - beta version
 *
 * Hacked by Sed.
 * You may contact me at: sed@free.fr
 *
 * Sorry for my bad english.
 *
 */

/*
 * NOTE(review): this copy of the file reached review with everything between
 * a '<' and the following '>' stripped out (include names, parts of loops,
 * the end of scan_blocks, the start of dump_mac_char, HTML tags in strings).
 * The spans marked "reconstructed" below were rebuilt from context and must
 * be compared with a pristine copy of the program.
 */

/* Include names are reconstructed from the identifiers the code uses. */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>

/* The file is handled as a sequence of fixed-size blocs. */
enum { BLOCK_SIZE = 256 };

/* One byte per bloc of the file; set to 1 once a bloc has been claimed. */
static char *the_blocks;

/* File format version selected on the command line (3 or 4). */
static int version;

#define CHAR_STYLE_START 0x01
#define CHAR_STYLE_END   0x02

/* One extracted character plus the style ids attached to it. */
typedef struct {
    unsigned char c;
    char flag;
    int char_style;
    int par_style;
} carac;

/* One extracted text: growable array of characters. */
typedef struct {
    carac *data;
    int size;
    int maxsize;
} text;

/* All extracted texts: growable array of texts. */
typedef struct {
    text *data;
    int size;
    int maxsize;
} document;

document docs = { .data = NULL, .size = 0, .maxsize = 0 };

/* Report a structurally invalid input file and terminate with failure. */
static void bad_file(void)
{
    fprintf(stderr, "bad xpress file\n");
    exit(1);
}

/*
 * Combine four bytes, least significant first, into a signed 32-bit value.
 * The work is done on unsigned operands: shifting a promoted byte >= 0x80
 * left by 24 in a signed int is undefined behaviour.
 */
static long decode32(unsigned char b0, unsigned char b1,
                     unsigned char b2, unsigned char b3)
{
    uint32_t u = (uint32_t)b0 | ((uint32_t)b1 << 8) |
                 ((uint32_t)b2 << 16) | ((uint32_t)b3 << 24);

    return (long)(int32_t)u;
}

/* Combine two bytes, least significant first, into a signed 16-bit value. */
static short decode16(unsigned char b0, unsigned char b1)
{
    return (short)(int16_t)(uint16_t)((unsigned)b0 | ((unsigned)b1 << 8));
}

/*
 * Append an empty text to the global document and return it.
 * The returned pointer is only valid until the next call to new_text(),
 * because the array holding the texts may be reallocated.
 * Exits the program if memory cannot be obtained.
 */
text *new_text(void)
{
    text *t;

    if (docs.maxsize == docs.size) {
        int grown = docs.maxsize + 64;
        text *tmp = realloc(docs.data, (size_t)grown * sizeof *tmp);

        if (!tmp) {
            perror("realloc");
            exit(1);
        }
        docs.data = tmp;
        docs.maxsize = grown;
    }

    t = &docs.data[docs.size++];
    t->data = NULL;
    t->size = 0;
    t->maxsize = 0;
    return t;
}

/*
 * Append len raw bytes starting at c to text t, with flag and style ids
 * cleared. A len <= 0 is ignored. Exits the program on allocation failure.
 */
void add_text(text *t, char *c, int len)
{
    int i;

    if (len <= 0)
        return;

    if (len > INT_MAX - 128 - t->size) {
        fprintf(stderr, "text too large\n");
        exit(1);
    }

    if (t->size + len > t->maxsize) {
        /* keep some slack so short appends do not reallocate each time */
        int grown = t->size + len + 128;
        carac *tmp = realloc(t->data, (size_t)grown * sizeof *tmp);

        if (!tmp) {
            perror("realloc");
            exit(1);
        }
        t->data = tmp;
        t->maxsize = grown;
    }

    for (i = 0; i < len; i++) {
        carac *dst = &t->data[t->size + i];

        dst->c = (unsigned char)c[i];
        dst->flag = 0;
        dst->char_style = 0;
        dst->par_style = 0;
    }
    t->size += len;
}

/* Read the 32-bit value stored pos bytes into bloc b of the file image p. */
long read_bloc_long(unsigned char *p, int b, int pos)
{
    p += (size_t)b * BLOCK_SIZE + (size_t)pos;
    return decode32(p[0], p[1], p[2], p[3]);
}

/* Read the 16-bit value stored pos bytes into bloc b of the file image p. */
short read_bloc_short(unsigned char *p, int b, int pos)
{
    p += (size_t)b * BLOCK_SIZE + (size_t)pos;
    return decode16(p[0], p[1]);
}

/*
 * Flag len consecutive blocs, starting at bloc, as used.
 * p is unused and kept only for the existing call sites. A non-positive len
 * marks nothing (the previous loop never terminated on a negative len and
 * wrote outside the_blocks).
 */
void mark_bloc(char *p, int bloc, int len)
{
    int i;

    (void)p;
    for (i = 0; i < len; i++)
        the_blocks[bloc + i] = 1;
}

/*
 * Walk the chain of blocs that starts at bloc 2 and mark each of them, so
 * that scan_blocks() does not consider them as text candidates.
 * l is the length of the file in bytes.
 * Returns normally when the chain ends with a 0 link; exits with an error
 * on any inconsistency.
 */
void mark_undata_blocks(char *p, int l)
{
    unsigned char *base = (unsigned char *)p;
    int nb_blocs = l >> 8;
    int bloc = 2;
    int len = 1;      /* initial bloc has length 1 bloc */

    /* blocs 0 and 1 are the header and bloc 2 starts the chain */
    if (nb_blocs < 3)
        bad_file();

    the_blocks[0] = 1;
    the_blocks[1] = 1;

    while (bloc < nb_blocs) {
        long next;

        /* the length comes from the file: refuse empty or overlong runs */
        if (len <= 0 || len > nb_blocs - bloc)
            break;
        /* coming back to a bloc already claimed means the chain loops */
        if (the_blocks[bloc])
            break;

        mark_bloc(p, bloc, len);

        /* the link to the next bloc is in the last 4 bytes of the current one */
        next = read_bloc_long(base, bloc, len * BLOCK_SIZE - 4);
        if (next == 0)
            return;

        if (next < 0) {
            /* next < 0 => len of bloc is the first 2 bytes of the bloc */
            if (next < -(long)nb_blocs)
                break;
            bloc = (int)(-next - 1);
            len = read_bloc_short(base, bloc, 0);
        } else {
            /* next > 0 => len of bloc is 1 */
            if (next > nb_blocs)
                break;
            bloc = (int)(next - 1);
            len = 1;
        }
    }

    bad_file();
}

/* Read a 32-bit value at p, least significant byte first. */
long _read_long(unsigned char *p)
{
    return decode32(p[0], p[1], p[2], p[3]);
}

/* Read a 16-bit value at p, least significant byte first. */
short _read_short(unsigned char *p)
{
    return decode16(p[0], p[1]);
}

/* set by caller to know if the blocs will be marked used or not */
static int move_and_mark;

/*
 * Advance *reader by count positions inside a chained run of blocs.
 * *size is the number of bytes left before the link stored at the end of
 * the current run; when it reaches 0 the link is followed instead of
 * stepping. Blocs reached through a link are flagged in the_blocks when
 * move_and_mark is set.
 * Returns 0 on success, -1 when the chain is broken or leaves the file.
 */
int move_reader(unsigned char *start, int nb_blocs, unsigned char **reader,
                int *size, int count)
{
    unsigned char *end = start + (size_t)nb_blocs * BLOCK_SIZE;

    while (count) {
        int next;
        int len;

        count--;
        if (*reader >= end)
            return -1;

        if (*size) {
            (*size)--;
            (*reader)++;
            continue;
        }

        next = (int)_read_long(*reader);
        if (next == 0)
            return -1;

        if (next > 0) {
            /* positive link: the target is a single bloc */
            next--;
            if (next >= nb_blocs)
                return -1;
            if (the_blocks[next] == 1)
                return -1;
            *size = 252;
            if (*size < count)
                return -1;
            *reader = start + (size_t)next * BLOCK_SIZE;
            if (move_and_mark)
                the_blocks[next] = 1;
            continue;
        }

        /* negative link: the target starts with its own length in blocs */
        if (next < -nb_blocs)      /* also avoids negating INT_MIN */
            return -1;
        next = -next - 1;
        if (the_blocks[next] == 1)
            return -1;
        *reader = start + (size_t)next * BLOCK_SIZE + 2;
        if (*reader >= end)
            return -1;
        len = _read_short(*reader - 2);
        if (len <= 0)
            return -1;
        *size = len * BLOCK_SIZE - 6;
        if (move_and_mark)
            the_blocks[next] = 1;
    }
    return 0;
}

/*
 * Fetch the byte under the reader into *out, then step one position.
 * A link is followed first when the current run is exhausted.
 * Returns 0 on success, -1 when move_reader() fails.
 */
static int next_byte(char *start, int nb_blocs, unsigned char **reader,
                     int *size, unsigned char *out)
{
    unsigned char *base = (unsigned char *)start;

    if (*size == 0 && move_reader(base, nb_blocs, reader, size, 1) == -1)
        return -1;
    *out = **reader;
    if (move_reader(base, nb_blocs, reader, size, 1) == -1)
        return -1;
    return 0;
}

/*
 * Read a 32-bit value through the chained reader.
 * *error is set to 0 on success and to -1 (with 0 returned) on failure.
 */
long read_long(int *error, char *start, int nb_blocs, unsigned char **reader,
               int *size)
{
    unsigned char b[4];
    int i;

    *error = -1;
    for (i = 0; i < 4; i++) {
        if (next_byte(start, nb_blocs, reader, size, &b[i]) == -1)
            return 0;
    }
    *error = 0;
    return decode32(b[0], b[1], b[2], b[3]);
}

/*
 * Read a 16-bit value through the chained reader.
 * *error is set to 0 on success and to -1 (with 0 returned) on failure.
 */
short read_short(int *error, char *start, int nb_blocs, unsigned char **reader,
                 int *size)
{
    unsigned char b[2];
    int i;

    *error = -1;
    for (i = 0; i < 2; i++) {
        if (next_byte(start, nb_blocs, reader, size, &b[i]) == -1)
            return 0;
    }
    *error = 0;
    return decode16(b[0], b[1]);
}

/*
 * Read one table of (style, run length) records and check that the run
 * lengths add up to exactly text_len. When t is not NULL each run is copied
 * onto the characters of t: into par_style when is_par is set, into
 * char_style otherwise.
 * Returns 1 when the table is consistent, 0 otherwise.
 */
static int read_style_runs(char *p, int nb_blocs, unsigned char **reader,
                           int *size, int text_len, text *t, int is_par)
{
    int error;
    int remaining = text_len;
    int pos = 0;
    int nb_runs;

    nb_runs = (int)read_long(&error, p, nb_blocs, reader, size);
    if (error == -1)
        return 0;
    if (nb_runs < 0)
        return 0;
    nb_runs /= (version == 4) ? 8 : 6;

    while (nb_runs) {
        int style;
        int nb_style;

        if (version == 4)
            style = (int)read_long(&error, p, nb_blocs, reader, size);
        else
            style = read_short(&error, p, nb_blocs, reader, size);
        if (error == -1)
            return 0;

        nb_style = (int)read_long(&error, p, nb_blocs, reader, size);
        if (error == -1)
            return 0;
        if (nb_style < 0)
            return 0;
        /* it may be 0, don't ask me why */
        remaining -= nb_style;
        if (remaining < 0)
            return 0;

        if (t) {
            while (nb_style) {
                if (is_par)
                    t->data[pos].par_style = style;
                else
                    t->data[pos].char_style = style;
                pos++;
                nb_style--;
            }
        }
        nb_runs--;
    }

    return remaining == 0;
}

/*
 * Decide whether bloc b starts a text description.
 * With ok == 0 this is a dry run: nothing is recorded. With ok != 0 the
 * blocs that are read get flagged and the text is appended to docs; a text
 * entry stays in docs even if validation fails part-way.
 * return 1 if maybe text bloc, 0 if sure that not
 */
int is_text(char *p, int nb_blocs, int b, int ok)
{
    int error;
    unsigned char *reader = (unsigned char *)p + (size_t)b * BLOCK_SIZE;
    int size = 252;
    int len;
    int text_len;
    int nb_text_blocs;
    int entry_size = (version == 4) ? 8 : 6;
    text *t = NULL;

    if (ok) {
        the_blocks[b] = 1;
        t = new_text();
    }
    move_and_mark = ok;

    /* total number of characters of the text */
    text_len = len = (int)read_long(&error, p, nb_blocs, &reader, &size);
    if (error == -1)
        return 0;
    if (len <= 0)
        return 0;

    /* byte size of the table listing the blocs that hold the characters */
    nb_text_blocs = (int)read_long(&error, p, nb_blocs, &reader, &size);
    if (error == -1)
        return 0;
    if (nb_text_blocs <= 0 || nb_text_blocs % entry_size)
        return 0;
    nb_text_blocs /= entry_size;

    while (nb_text_blocs) {
        int bloc;
        int bloc_len;

        bloc = (int)read_long(&error, p, nb_blocs, &reader, &size);
        if (error == -1)
            return 0;
        /* bloc numbers are 1-based in the file */
        if (bloc <= 0 || bloc > nb_blocs)
            return 0;
        bloc--;
        if (the_blocks[bloc] == 1)
            return 0;

        if (version == 4)
            bloc_len = (int)read_long(&error, p, nb_blocs, &reader, &size);
        else
            bloc_len = read_short(&error, p, nb_blocs, &reader, &size);
        if (error == -1)
            return 0;
        if (bloc_len <= 0 || bloc_len > BLOCK_SIZE || bloc_len > len)
            return 0;
        len -= bloc_len;

        if (ok)
            add_text(t, p + (size_t)bloc * BLOCK_SIZE, bloc_len);
        nb_text_blocs--;
    }
    if (len != 0)
        return 0;

    /* character styles, then paragraph styles; each must cover the text */
    if (!read_style_runs(p, nb_blocs, &reader, &size, text_len, t, 0))
        return 0;
    if (!read_style_runs(p, nb_blocs, &reader, &size, text_len, t, 1))
        return 0;

    return 1;
}

/*
 * Try every bloc of the file as the start of a text description.
 * len is the length of the file in bytes.
 *
 * NOTE(review): reconstructed. Only the signature, the two declarations and
 * the opening of the for loop survived; the dry-run-then-commit body is
 * inferred from the meaning of is_text()'s `ok` parameter. Confirm.
 */
void scan_blocks(char *p, int len)
{
    int nb_blocks = len >> 8;
    int i;

    for (i = 0; i < nb_blocks; i++) {
        if (the_blocks[i])
            continue;
        if (is_text(p, nb_blocks, i, 0))
            is_text(p, nb_blocks, i, 1);
    }
}

/*
 * Unicode code points for the Mac Roman bytes 0x80..0xFF.
 * NOTE(review): the original table was lost; this is the standard Mac OS
 * Roman mapping (0xDB taken as the euro sign). Confirm against the
 * reference table.
 */
static const unsigned short mac_to_unicode[128] = {
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
};

/*
 * Write one Mac Roman character to stdout as UTF-8 HTML text.
 * '&', '<' and '>' are written as entities, a '\n' is followed by a line
 * break tag, and bytes >= 0x80 are mapped to Unicode before encoding.
 *
 * NOTE(review): the start of this function was lost (the parameter name is
 * a guess). The surviving live code wrote bytes >= 0x80 unchanged and kept
 * the UTF-8 encoder under #if 0; the encoder is enabled here because the
 * file header promises UTF-8 output. The '&' escape and the exact line
 * break tag are reconstructions. Confirm.
 */
void dump_mac_char(unsigned char c)
{
    unsigned int t1 = c;

    if (t1 == '&') {
        fputs("&amp;", stdout);
        return;
    }
    if (t1 == '<') {
        fputs("&lt;", stdout);
        return;
    }
    if (t1 == '>') {
        fputs("&gt;", stdout);
        return;
    }

    if (t1 >= 0x80)
        t1 = mac_to_unicode[t1 - 0x80];

    if (t1 <= 0x7f) {
        putc((int)t1, stdout);
        if (t1 == '\n')
            fputs("<br>\n", stdout);
        return;
    }
    if (t1 <= 0x7ff) {
        putc((int)(0xc0 | ((t1 >> 6) & 31)), stdout);
        putc((int)(0x80 | (t1 & 0x3f)), stdout);
        return;
    }
    putc((int)(0xe0 | ((t1 >> 12) & 15)), stdout);
    putc((int)(0x80 | ((t1 >> 6) & 63)), stdout);
    putc((int)(0x80 | (t1 & 63)), stdout);
}

#if 0
/* Disabled: flag the first and last character of each character-style run. */
void merge_styles(text *t)
{
    int i;
    int curstyle;

    if (t->size == 0)
        return;
    t->data[0].flag = CHAR_STYLE_START;
    curstyle = t->data[0].char_style;
    for (i = 1; i < t->size; i++) {
        if (t->data[i].char_style == curstyle)
            continue;
        t->data[i-1].flag |= CHAR_STYLE_END;
        t->data[i].flag = CHAR_STYLE_START;
        curstyle = t->data[i].char_style;
    }
    t->data[i-1].flag |= CHAR_STYLE_END;
}

/* Disabled: fold one-character whitespace runs into a neighbouring run. */
void normalize_styles(text *t)
{
    int i;
    char c;

    for (i = 0; i < t->size; i++) {
        if (t->data[i].flag != (CHAR_STYLE_START | CHAR_STYLE_END))
            continue;
        c = t->data[i].c;
        if (c != ' ' && c != '\n' && c > 0x20)
            continue;
        if (i >= 1) {
            t->data[i].char_style = t->data[i-1].char_style;
            t->data[i-1].flag &= ~CHAR_STYLE_END;
            t->data[i].flag &= ~CHAR_STYLE_START;
            if (i < t->size-1 &&
                t->data[i].char_style == t->data[i+1].char_style) {
                t->data[i].flag &= ~CHAR_STYLE_END;
                t->data[i+1].flag &= ~CHAR_STYLE_START;
            }
        } else if (i < t->size-1) {
            t->data[i].char_style = t->data[i+1].char_style;
            t->data[i].flag &= ~CHAR_STYLE_END;
            t->data[i+1].flag &= ~CHAR_STYLE_START;
        }
    }
}
#endif

/*
 * Write text number n of the document to stdout, followed by a separator.
 * Style markup is not emitted: the calls to merge_styles() and
 * normalize_styles() and the per-run tags were already commented out.
 *
 * NOTE(review): the separator tag was lost; <hr> is a reconstruction.
 */
void dump_text(int n)
{
    text *t = &docs.data[n];
    int i;

    for (i = 0; i < t->size; i++)
        dump_mac_char(t->data[i].c);

    fprintf(stdout, "\n<hr>\n");
}

/*
 * Write the whole document to stdout as one HTML page.
 *
 * NOTE(review): every tag in these strings was lost; only the newlines and
 * the word DUMP survived. The markup below is a reconstruction that keeps
 * the surviving newline layout.
 */
void dump_texts(void)
{
    int t = 0;

    fprintf(stdout, "<html>\n");
    fprintf(stdout, "<head>\n<meta http-equiv=\"Content-Type\" "
                    "content=\"text/html; charset=utf-8\">\n");
    fprintf(stdout, "<title>DUMP</title>\n");
    fprintf(stdout, "</head>\n");
    fprintf(stdout, "<body>\n\n");

    while (t < docs.size) {
        dump_text(t);
        t++;
    }

    fprintf(stdout, "</body>\n</html>\n");
}

/*
 * Print the command line help on stderr and return the exit status to use.
 * NOTE(review): the placeholders of the first line were lost; reconstructed.
 */
static int usage(const char *prog)
{
    fprintf(stderr, "usage: %s [-v3|-v4] <xpress-file>\n",
            prog ? prog : "xpress2html");
    fprintf(stderr, " -v3 : if you have a version 3 file\n");
    fprintf(stderr, " -v4 : if it is a version 4 file\n");
    return 1;
}

/*
 * xpress2html [-v3|-v4] file
 * Load the file in memory, mark the blocs that cannot hold text, scan the
 * remaining ones and write the texts found to stdout as HTML.
 * Returns 0 on success, 1 on any error.
 */
int main(int n, char **v)
{
    FILE *fd;
    long file_len;
    int len;
    char *p;

    if (n != 3)
        return usage(n > 0 ? v[0] : NULL);

    version = -1;
    if (strcmp(v[1], "-v3") == 0)
        version = 3;
    if (strcmp(v[1], "-v4") == 0)
        version = 4;
    if (version == -1)
        return usage(v[0]);

    fd = fopen(v[2], "rb");
    if (fd == NULL) {
        perror("fopen");
        return 1;
    }

    /* size of the file, obtained with ISO C calls only */
    if (fseek(fd, 0L, SEEK_END) != 0 ||
        (file_len = ftell(fd)) < 0 ||
        fseek(fd, 0L, SEEK_SET) != 0) {
        perror("fseek");
        fclose(fd);
        return 1;
    }

    if (file_len & 255) {
        fprintf(stderr, "bad length, not multiple of 256\n");
        fclose(fd);
        return 1;
    }
    /* every offset below is an int; an empty file has no header blocs */
    if (file_len == 0 || file_len > INT_MAX) {
        fprintf(stderr, "bad xpress file\n");
        fclose(fd);
        return 1;
    }
    len = (int)file_len;

    p = malloc((size_t)len);
    if (p == NULL) {
        fprintf(stderr, "out of memory\n");
        fclose(fd);
        return 1;
    }

    /* fread() loops over short reads internally, unlike a single read() */
    if (fread(p, 1, (size_t)len, fd) != (size_t)len) {
        fprintf(stderr, "error calling read\n");
        fclose(fd);
        free(p);
        return 1;
    }
    fclose(fd);

    the_blocks = calloc((size_t)(len >> 8), sizeof(char));
    if (!the_blocks) {
        perror("calloc");
        free(p);
        return 1;
    }

    mark_undata_blocks(p, len);
    scan_blocks(p, len);
    dump_texts();

    free(the_blocks);
    free(p);
    return 0;
}