/*
 * xpress2html : extract texts from a Quark Xpress file
 * and saves it in an html file, encoded with UTF-8.
 * (only tested with versions 3.3 and 4.0 for Mac; it
 * probably won't work with something else, just try...)
 *
 * xpress2html is brutal. It will scan all the
 * blocs of the XPress file and 'guess' if they
 * are blocs describing texts or not.
 * It may fail in the process, you may get
 * in the html file some data which is not real
 * text. But the probability is low, I think.
 * xpress2html should extract ALL the texts
 * from the file, as far as the format doesn't
 * change too much.
 *
 * Texts will be extracted in the order they
 * appear in the file, which probably will be
 * different from what you have in the XPress
 * output. Hey, I didn't create the file
 * format, I'm not responsible for this.
 *
 * All formatting information will be lost.
 * It's tricky to extract this. A font
 * named "myfont", how do you know if
 * the text will be italic, bold, and
 * the like ? xpress2html simply doesn't
 * handle this, unfortunately.
 *
 * xpress2html does only handle Mac files.
 * Who uses XPress with Windows by the way ?
 * The character set is supposed to be
 * Mac Roman.
 *
 * xpress2html only works under linux.
 * One may modify it to let it work under
 * other systems. Ask me some help if
 * you really need it.
 *
 * latest version available at:
 * http://sed.free.fr/xpress2html.c
 *
 * Released in the public domain.
 * Freedom hates proprietary formats.
 * Freedom hates licenses.
 *
 * This program would simply not exist if Frans Faase
 * didn't hack those XPress files for something like
 * two years. Special greetings must fly to him.
 * See his website:
 * http://home.planet.nl/~faase009/QX.html
 *
 * Quark is going to lose the game. Keep your
 * file format secret, who cares. At some time
 * in the future, someone somewhere will write
 * from scratch an Xpress like program, based
 * on free technologies, available for everyone,
 * easy to use, friendly, all what one may dream
 * about. And it will be free software.
 *
 * In the meantime, people still use XPress, and
 * need to extract data from THEIR files, and
 * don't want to waste their money on accessing
 * THEIR own data. So, people with some hacking
 * skills will produce little tools like this one.
 *
 * Release info:
 * Fri, 30 May 2003 18:28:41 +0200 - starting some hack
 * Sun,  1 Jun 2003 22:31:34 +0200 - beta version
 *
 * Hacked by Sed.
 * You may contact me at: sed@free.fr
 *
 * Sorry for my bad english.
 *
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

/* One flag byte per 256-byte bloc of the input file (allocated in main()).
 * A value of 1 means the bloc has already been claimed: by the chain
 * walked in mark_undata_blocks(), by a link followed in move_reader()
 * while move_and_mark is set, or by is_text() when it accepts a header.
 * scan_blocks() skips blocs whose flag is 1. */
static char *the_blocks;

/* File format version chosen on the command line: 3 (-v3) or 4 (-v4).
 * is_text() uses it to pick the field widths of the tables it parses. */
static int version;

/* Bit values for carac.flag: first / last character of a run sharing the
 * same character style.  Only the disabled (#if 0) style code sets them. */
#define CHAR_STYLE_START	0x01
#define CHAR_STYLE_END		0x02

/* One extracted character together with its style information. */
typedef struct {
  unsigned char c;  /* raw byte copied from the XPress file */
  char flag;        /* CHAR_STYLE_START / CHAR_STYLE_END bits, 0 when unused */
  int char_style;   /* character style value read from the text header */
  int par_style;    /* paragraph style value read from the text header */
} carac;

/* Growable array of characters: one extracted text. */
typedef struct {
  carac *data;      /* heap array, NULL while empty */
  int size;         /* number of characters in use */
  int maxsize;      /* number of characters allocated */
} text;

/* Growable array of texts. */
typedef struct {
  text *data;       /* heap array, NULL while empty */
  int size;         /* number of texts in use */
  int maxsize;      /* number of texts allocated */
} document;

/* Every text extracted so far, in the order it was found; starts empty.
 * (Standard designated initializers replace the obsolete GNU `field:`
 * syntax.) */
document docs = { .data = NULL, .size = 0, .maxsize = 0 };

text *new_text(void)
{
  if (docs.maxsize == docs.size) {
    docs.data = realloc(docs.data, (docs.maxsize+=64) * sizeof(text));
    if (!docs.data) {
      perror("realloc");
      exit(0);
    }
  }

  docs.data[docs.size].data = 0;
  docs.data[docs.size].size = docs.data[docs.size].maxsize = 0;

  docs.size++;

  return &docs.data[docs.size-1];
}

/*
 * Append `len` raw bytes starting at `c` to the text `t`.
 *
 * Each appended character gets flag 0 and style values 0; the caller fills
 * the styles in afterwards.  Calls with len <= 0 are ignored.
 *
 * Storage grows only when the existing capacity is insufficient, and then
 * with 128 characters of slack.  On allocation failure a diagnostic is
 * printed and the program exits with a non-zero status.
 */
void add_text(text *t, char *c, int len)
{
  int i;
  int needed;

  /* nothing to append; also prevents a negative len from shrinking t */
  if (len <= 0) return;

  needed = t->size + len;

  if (needed > t->maxsize) {
    int new_max = needed + 128;
    /* keep the old pointer until we know realloc succeeded */
    carac *grown = realloc(t->data, new_max * sizeof *grown);

    if (grown == NULL) {
      perror("realloc");
      exit(1);  /* was exit(0): a failure must not report success */
    }

    t->data = grown;
    t->maxsize = new_max;
  }

  for (i = 0; i < len; i++) {
    carac *dst = &t->data[t->size + i];

    dst->c = (unsigned char)c[i];
    dst->flag = 0;
    dst->char_style = 0;
    dst->par_style = 0;
  }

  t->size = needed;
}

/*
 * Read a 32-bit little-endian signed value located `pos` bytes into
 * bloc `b` of the file image `p` (blocs are 256 bytes long).
 *
 * No bounds checking is done; the caller must make sure the four bytes
 * lie inside the buffer.
 */
long read_bloc_long(unsigned char *p, int b, int pos)
{
  const unsigned char *q = p + b * 256 + pos;

  /* Assemble in unsigned arithmetic: shifting a byte >= 0x80 left by 24
   * as a (promoted) int would overflow into the sign bit, which is
   * undefined behaviour. */
  unsigned int v = (unsigned int)q[0] |
                  ((unsigned int)q[1] << 8) |
                  ((unsigned int)q[2] << 16) |
                  ((unsigned int)q[3] << 24);

  /* Reinterpret as a signed 32-bit quantity (assumes a 32-bit int), so
   * that values with the top bit set come out negative as callers expect. */
  return (int)v;
}

/*
 * Read a 16-bit little-endian value located `pos` bytes into bloc `b`
 * of the file image `p` (blocs are 256 bytes long).  No bounds checking.
 */
short read_bloc_short(unsigned char *p, int b, int pos)
{
  const unsigned char *field = p + b * 256 + pos;
  int low = field[0];
  int high = field[1];

  return low | (high << 8);
}

/*
 * Flag `len` consecutive blocs, starting at bloc number `bloc`, as used
 * in the_blocks.  A len <= 0 marks nothing (the previous loop tested
 * `len != 0` and therefore ran past the array for negative lengths).
 *
 * `p` is not used; it is kept so the signature stays unchanged.
 * The caller is responsible for bloc + len not exceeding the bloc count.
 */
void mark_bloc(char *p, int bloc, int len)
{
  int i;

  (void)p;

  for (i = 0; i < len; i++)
    the_blocks[bloc + i] = 1;
}

/*
 * Walk the chain of blocs that starts at bloc 2 and flag every bloc it
 * covers (plus the two header blocs 0 and 1) in the_blocks, so that
 * scan_blocks() does not probe them.
 *
 * The last four bytes of each run of blocs hold a link to the next run:
 *   0      end of the chain (normal return);
 *   n > 0  the next run is the single bloc n-1;
 *   n < 0  the next run starts at bloc -n-1, and its length in blocs is
 *          stored in the first two bytes of that bloc.
 *
 * p is the file image, l its length in bytes.  If the chain leaves the
 * file, has a non-positive run length, or never ends, "bad xpress file"
 * is printed and the program exits with a non-zero status.
 */
void mark_undata_blocks(char *p, int l) /* l is length of file */
{
  unsigned char *data = (unsigned char *)p;
  int nb_blocs = l >> 8;
  int bloc = 2;
  int len = 1;    /* initial bloc has length 1 bloc */
  int next;
  int steps = 0;

  /* Need the two header blocs plus bloc 2; this also keeps the two
   * writes below inside the_blocks. */
  if (nb_blocs >= 3) {
    the_blocks[0] = 1;
    the_blocks[1] = 1;

    /* the 2 first blocs are header, then bloc 2 has length 1 bloc */
    while (bloc < nb_blocs) {
      /* the run must be non-empty and lie entirely inside the file */
      if (len < 1 || bloc + len > nb_blocs) break;

      /* The walk is fully determined by (bloc, how len was obtained), so
       * a valid chain has at most 2 * nb_blocs steps; more means the
       * links form a cycle. */
      if (++steps > 2 * nb_blocs) break;

      mark_bloc(p, bloc, len);

      /* the link sits in the last four bytes of the current run */
      next = read_bloc_long(data, bloc, len * 256 - 4);

      if (next == 0) return;

      if (next < 0) {
        /* same as -next - 1, but cannot overflow when next is INT_MIN */
        next = -(next + 1);

        if (next >= nb_blocs) break;

        /* next < 0 => len of bloc is the first 2 bytes of the bloc */
        len = read_bloc_short(data, next, 0);
        bloc = next;

        continue;
      }

      /* next > 0 => len of bloc is 1 */
      bloc = next - 1;
      len = 1;
    }
  }

  fprintf(stderr, "bad xpress file\n");
  exit(1);  /* was exit(0): a bad file must not report success */
}

/*
 * Read a 32-bit little-endian signed value from the four bytes at p.
 * No bounds checking is done.
 */
long _read_long(unsigned char *p)
{
  /* Assemble in unsigned arithmetic: `p[3] << 24` on a promoted int is
   * undefined behaviour when the byte is >= 0x80. */
  unsigned int v = (unsigned int)p[0] |
                  ((unsigned int)p[1] << 8) |
                  ((unsigned int)p[2] << 16) |
                  ((unsigned int)p[3] << 24);

  /* Reinterpret as a signed 32-bit quantity (assumes a 32-bit int), so
   * that values with the top bit set come out negative as callers expect. */
  return (int)v;
}

/*
 * Read a 16-bit little-endian value from the two bytes at p.
 * No bounds checking is done.
 */
short _read_short(unsigned char *p)
{
  int value = p[1];

  value = (value << 8) | p[0];

  return value;
}

/* set by caller to know if the blocs will be marked used or not */
static int move_and_mark;

/*
 * Advance a cursor through chained blocs by `count` steps.
 *
 * The cursor is the pair (*reader, *size): *reader points at the current
 * byte and *size is the number of bytes left before the 4-byte link that
 * ends the current run of blocs.
 *
 * One step is either:
 *   - *size > 0: move one byte forward; or
 *   - *size == 0: read the link at *reader and jump to the run it names,
 *     leaving the cursor on that run's first data byte.  Following a link
 *     uses up the step by itself; no byte is skipped.
 *
 * Link values:
 *   n > 0  single bloc n-1, 252 data bytes;
 *   n < 0  run starting at bloc -n-1; its first two bytes give the run
 *          length in blocs, leaving len * 256 - 6 data bytes.
 *
 * When move_and_mark is non-zero, the bloc a link points to is flagged in
 * the_blocks (only the first bloc of a multi-bloc run).
 *
 * start / nb_blocs describe the file image.  Returns 0 on success, -1 if
 * the cursor is past the end of the file, the link is 0, the target is
 * out of range or already flagged, or the run length is not positive.
 */
int move_reader(unsigned char *start, int nb_blocs, unsigned char **reader, int *size, int count)
{
  int next;
  int len;

  while (count) {
    count--;

    if (*reader >= start + nb_blocs * 256) return -1;

    /* still inside the current run: plain one-byte advance */
    if (*size) { (*size)--; (*reader)++; continue; }

    /* end of the run: *reader is on the link to the next one */
    next = _read_long(*reader);

    if (next == 0) return -1;

    if (next > 0) {
      next--;
      if (next >= nb_blocs) return -1;
      if (the_blocks[next] == 1) return -1;
      *size = 252;
      if (*size < count) return -1;
      *reader = start + next * 256;

      if (move_and_mark) the_blocks[next] = 1;

      continue;
    }

    /* next < 0 */
    /* Same as -next - 1, but cannot overflow when the link is INT_MIN
     * (arbitrary blocs are probed, so any bit pattern may show up). */
    next = -(next + 1);

    if (next >= nb_blocs) return -1;
    if (the_blocks[next] == 1) return -1;

    *reader = start + next * 256 + 2;
    if (*reader >= start + nb_blocs * 256) return -1;

    /* run length, in blocs, stored in the first two bytes of the run */
    len = _read_short(*reader - 2);
    if (len <= 0) return -1;

    /* minus the 2-byte length in front and the 4-byte link at the end */
    *size = len * 256 - 6;

    if (move_and_mark) the_blocks[next] = 1;
  }

  return 0;
}

/*
 * Read a 32-bit little-endian signed value at the cursor (*reader, *size)
 * and advance the cursor past it, following bloc links as needed.
 *
 * For each of the four bytes: if the current run is exhausted, follow the
 * link first; fetch the byte; then advance by one step.
 *
 * *error is set to 0 on success.  If any cursor move fails, *error is -1
 * and 0 is returned; the cursor is then left wherever the failure occurred.
 */
long read_long(int *error, char *start, int nb_blocs, unsigned char **reader, int *size)
{
  unsigned char *base = (unsigned char *)start;
  unsigned int value = 0;
  int i;

  *error = -1;

  for (i = 0; i < 4; i++) {
    if (*size == 0 && move_reader(base, nb_blocs, reader, size, 1) == -1) return 0;

    /* unsigned arithmetic: shifting a byte >= 0x80 left by 24 as an int
     * would be undefined behaviour */
    value |= (unsigned int)**reader << (8 * i);

    if (move_reader(base, nb_blocs, reader, size, 1) == -1) return 0;
  }

  *error = 0;

  /* reinterpret as a signed 32-bit quantity (assumes a 32-bit int) */
  return (int)value;
}

/*
 * Read a 16-bit little-endian value at the cursor (*reader, *size) and
 * advance the cursor past it, following bloc links as needed.
 *
 * *error is set to 0 on success; if a cursor move fails, *error is -1
 * and 0 is returned.
 */
short read_short(int *error, char *start, int nb_blocs, unsigned char **reader, int *size)
{
  int value = 0;
  int shift;

  *error = -1;

  /* low byte first, then high byte */
  for (shift = 0; shift < 16; shift += 8) {
    if (*size == 0 && move_reader(start, nb_blocs, reader, size, 1) == -1)
      return 0;

    value |= **reader << shift;

    if (move_reader(start, nb_blocs, reader, size, 1) == -1)
      return 0;
  }

  *error = 0;

  return value;
}

/* return 1 if maybe text bloc, 0 if sure that not */
/*
 * Try to parse bloc `b` of the file image `p` (nb_blocs blocs long) as a
 * text header.
 *
 * Layout checked, every field being read through read_long()/read_short()
 * and therefore following bloc links:
 *   1. long: total text length in bytes, must be > 0;
 *   2. long: byte size of the text bloc table, must be > 0 and a multiple
 *      of the entry size (8 when version == 4, 6 otherwise);
 *   3. one entry per text bloc: long bloc number (1-based), then the
 *      number of bytes used in it (long when version == 4, short
 *      otherwise; 1..256).  The byte counts must add up to field 1;
 *   4. long: byte size of the character style table, then its entries:
 *      style value (long when version == 4, short otherwise) followed by
 *      a long run length.  The run lengths must add up to field 1;
 *   5. the same structure again for paragraph styles.
 *
 * ok == 0: dry run, nothing is flagged or stored.
 * ok != 0: bloc b is flagged in the_blocks, every bloc reached through a
 *          link is flagged too (via move_and_mark), and a new text is
 *          created in docs and filled with the bytes and styles.
 *
 * NOTE(review): with ok != 0 a failed check returns 0 but leaves the text
 * already created by new_text() in docs; scan_blocks() only calls with
 * ok != 0 after a successful dry run.
 */
int is_text(char *p, int nb_blocs, int b, int ok)
{
  int error;
  int len;                              /* bytes not yet accounted for in the section being checked */
  unsigned char *reader = p + b * 256;  /* cursor, starts on the first byte of bloc b */
  int size = 252;                       /* bytes left before the 4-byte link ending the bloc */
  int bloc, bloc_len;
  int nb_text_blocs;
  int text_len;                         /* total text length announced by the header */
  int nb_char_style;
  int nb_par_style;
  text *t;                              /* only set and used when ok != 0 */
  int pos;                              /* index of the next character to receive a style */

  if (ok) {
    the_blocks[b] = 1;

    t = new_text();
  }

  /* tells move_reader() whether to flag the blocs it jumps to */
  move_and_mark = ok;

  /* the pointer must always be on the last read character, so we must do it by hand for the first long */
  /* NOTE(review): the comment above looks outdated; the first long is
   * read with read_long() like every other field. */
  text_len = len = read_long(&error, p, nb_blocs, &reader, &size);
  if (error == -1) return 0;
  if (len <= 0) return 0;

  /* size in bytes of the text bloc table, converted to an entry count */
  nb_text_blocs = read_long(&error, p, nb_blocs, &reader, &size);
  if (error == -1) return 0;
  if (version == 4) {
    if (nb_text_blocs <= 0 || nb_text_blocs & 7) return 0;
    nb_text_blocs /= 8;
  } else {
    if (nb_text_blocs <= 0 || nb_text_blocs % 6) return 0;
    nb_text_blocs /= 6;
  }

  while (nb_text_blocs) {
    /* bloc numbers are stored 1-based */
    bloc = read_long(&error, p, nb_blocs, &reader, &size);
    if (error == -1) return 0;
    bloc--;
    if (bloc < 0 || bloc >= nb_blocs) return 0;
    if (the_blocks[bloc] == 1) return 0;

    if (version == 4)
      bloc_len = read_long(&error, p, nb_blocs, &reader, &size);
    else
      bloc_len = read_short(&error, p, nb_blocs, &reader, &size);
    if (error == -1) return 0;
    /* a text bloc holds between 1 and 256 bytes, never more than what is left */
    if (bloc_len <= 0 || bloc_len > 256 || bloc_len > len) return 0;

    len -= bloc_len;
    if (len < 0) return 0;

    /* the text bytes are taken straight from the start of the bloc */
    if (ok) add_text(t, p+bloc*256, bloc_len);

    nb_text_blocs--;
  }

  /* the text blocs must cover the announced length exactly */
  if (len != 0) return 0;

  len = text_len;

  pos = 0;

  /* size in bytes of the character style table, converted to an entry
   * count (no check that it is an exact multiple of the entry size) */
  nb_char_style = read_long(&error, p, nb_blocs, &reader, &size);
  if (error == -1) return 0;
  if (nb_char_style < 0) return 0;
  if (version == 4)
    nb_char_style /= 8;
  else
    nb_char_style /= 6;

  while (nb_char_style) {
    int style, nb_style;

    if (version == 4)
      style = read_long(&error, p, nb_blocs, &reader, &size);
    else
      style = read_short(&error, p, nb_blocs, &reader, &size);

    if (error == -1) return 0;

    /* number of consecutive characters using this style */
    nb_style = read_long(&error, p, nb_blocs, &reader, &size);
    if (error == -1) return 0;
    if (nb_style < 0) return 0;  /* it may be 0, don't ask me why */

    len -= nb_style;
    if (len < 0) return 0;

    /* len >= 0 here guarantees pos stays below text_len, the text size */
    if (ok) {
      while (nb_style) {
	t->data[pos].char_style = style;
	pos++;
	nb_style--;
      }
    }

    nb_char_style--;
  }

  /* the character style runs must cover the text exactly */
  if (len != 0) return 0;

  len = text_len;

  pos = 0;

  /* paragraph style table: same structure as the character style table */
  nb_par_style = read_long(&error, p, nb_blocs, &reader, &size);
  if (error == -1) return 0;
  if (nb_par_style < 0) return 0;
  if (version == 4)
    nb_par_style /= 8;
  else
    nb_par_style /= 6;

  while (nb_par_style) {
    int style, nb_style;

    if (version == 4)
      style = read_long(&error, p, nb_blocs, &reader, &size);
    else
      style = read_short(&error, p, nb_blocs, &reader, &size);

    if (error == -1) return 0;

    /* number of consecutive characters using this style */
    nb_style = read_long(&error, p, nb_blocs, &reader, &size);
    if (error == -1) return 0;
    if (nb_style < 0) return 0;  /* it may be 0, don't ask me why */

    len -= nb_style;
    if (len < 0) return 0;

    if (ok) {
      while (nb_style) {
	t->data[pos].par_style = style;
	pos++;
	nb_style--;
      }
    }

    nb_par_style--;
  }

  /* the paragraph style runs must cover the text exactly */
  if (len != 0) return 0;

  return 1;
}

void scan_blocks(char *p, int len)
{
  int nb_blocks = len >> 8;
  int i;

  for (i=0; i<nb_blocks; i++) {
    if (the_blocks[i] == 1)
      continue;

    if (is_text(p, nb_blocks, i, 0) == 0)
      continue;

    //fprintf(stdout, "text block %8.8x %d\n", i, i);
    is_text(p, nb_blocks, i, 1);
  }
}

/*
 * Mac Roman to Unicode mapping for the bytes 0x80..0xFF; entry i is the
 * code point for byte 0x80 + i.  Taken from:
 * http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
 */
int mac_to_unicode[] = {
  /* 0x80 */ 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
  /* 0x88 */ 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
  /* 0x90 */ 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
  /* 0x98 */ 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
  /* 0xA0 */ 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
  /* 0xA8 */ 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
  /* 0xB0 */ 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
  /* 0xB8 */ 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
  /* 0xC0 */ 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
  /* 0xC8 */ 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
  /* 0xD0 */ 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
  /* 0xD8 */ 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
  /* 0xE0 */ 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
  /* 0xE8 */ 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
  /* 0xF0 */ 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
  /* 0xF8 */ 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
};

/*
 * Write one extracted byte to stdout as HTML text.
 *
 *   - 0x1f is dropped;
 *   - 13 (carriage return) becomes a newline;
 *   - every other byte <= 0x20 becomes a space;
 *   - '<', '>' and '&' are written as "&lt;", "&gt;" and "&amp;" so that
 *     text content can never be read as markup ('&' was previously passed
 *     through unescaped);
 *   - any other byte, including those >= 0x80, is written unchanged.
 *
 * NOTE(review): no character set conversion is done here, and dump_texts()
 * labels the output WINDOWS-1252, while the file header talks about Mac
 * Roman input and UTF-8 output.  The mac_to_unicode table exists for such
 * a conversion but is not used; confirm which encoding is intended.
 */
void dump_mac_char(unsigned char t)
{
  int out = t;

  /* let's remove all the 0x1f */
  if (t == 0x1f) return;

  /* '-' (0x2D) followed by ^G (0x07) could be removed too, don't know... */

  if (t <= 0x20) out = ' ';
  if (t == 13) out = '\n';

  switch (out) {
  case '<':
    fputs("&lt;", stdout);
    break;
  case '>':
    fputs("&gt;", stdout);
    break;
  case '&':
    fputs("&amp;", stdout);
    break;
  default:
    putc(out, stdout);
    break;
  }
}

/* Character style post-processing.  The whole section is compiled out
 * (#if 0); its only call sites, in dump_text(), are commented out. */
#if 0
/*
 * Flag the boundaries of every run of consecutive characters sharing the
 * same char_style: the first character of a run gets CHAR_STYLE_START,
 * the last one gets CHAR_STYLE_END.  Does nothing for an empty text.
 */
void merge_styles(text *t)
{
  int i;
  int curstyle;

  if (t->size == 0) return;

  t->data[0].flag = CHAR_STYLE_START;
  curstyle = t->data[0].char_style;

  for (i = 1; i<t->size; i++) {
    if (t->data[i].char_style == curstyle) continue;

    /* style change: close the run on the previous character, open a new one here */
    t->data[i-1].flag |= CHAR_STYLE_END;
    t->data[i].flag = CHAR_STYLE_START;
    curstyle = t->data[i].char_style;
  }

  /* close the last run */
  t->data[i-1].flag |= CHAR_STYLE_END;
}

/*
 * Absorb one-character runs made of a space, a newline or a byte <= 0x20
 * into a neighbouring run, so that such characters do not get a style run
 * of their own.  Expects the flags set by merge_styles().
 */
void normalize_styles(text *t)
{
  int i;
  char c;

  for (i = 0; i<t->size; i++) {
    /* only runs of exactly one character are candidates */
    if (t->data[i].flag != (CHAR_STYLE_START | CHAR_STYLE_END)) continue;
    c = t->data[i].c;
    /* NOTE(review): c is a plain char; where char is signed, bytes >= 0x80
     * compare below 0x20 and are treated like whitespace here. */
    if (c != ' ' && c != '\n' && c>0x20) continue;
    if (i >= 1) {
      /* join the previous run ... */
      t->data[i].char_style = t->data[i-1].char_style;
      t->data[i-1].flag &= ~CHAR_STYLE_END;
      t->data[i].flag &= ~CHAR_STYLE_START;
      /* ... and the next one too if it now has the same style */
      if (i < t->size-1 && t->data[i].char_style == t->data[i+1].char_style) {
        t->data[i].flag &= ~CHAR_STYLE_END;
        t->data[i+1].flag &= ~CHAR_STYLE_START;
      }
    } else if (i < t->size-1) {
      /* first character of the text: join the following run instead */
      t->data[i].char_style = t->data[i+1].char_style;
      t->data[i].flag &= ~CHAR_STYLE_END;
      t->data[i+1].flag &= ~CHAR_STYLE_START;
    }
  }
}
#endif

void dump_text(int n)
{
  int i;
  text *t = &docs.data[n];

  /* let's merge char style infos */
  //merge_styles(t);
  //normalize_styles(t);

  for (i=0; i<t->size; i++) {
    //if (t->data[i].flag & CHAR_STYLE_START)
    //  fprintf(stdout, "<font style=\"s%d\">", t->data[i].char_style);

    dump_mac_char(t->data[i].c);

    //if (t->data[i].flag & CHAR_STYLE_END)
    //  fprintf(stdout, "</font>");
  }

  fprintf(stdout, "\n<hr>\n");
}

/*
 * Write the complete HTML document to stdout: a fixed prologue, every
 * text stored in `docs` in order, then the closing tags.
 */
void dump_texts(void)
{
  int idx;

  fputs("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 3.2//EN\">\n"
        "<html>\n<head>\n"
        "<title>DUMP</title>\n"
        "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=WINDOWS-1252\">\n"
        "</head>\n<body>\n",
        stdout);

  for (idx = 0; idx < docs.size; idx++)
    dump_text(idx);

  fputs("</body>\n</html>\n", stdout);
}

/* Print the command line synopsis on stderr; returns the exit status to use. */
static int print_usage(void)
{
  fprintf(stderr, "<prog> [-v3|-v4] <XPress file>\n");
  fprintf(stderr, "    -v3 : if you have a version 3 file\n");
  fprintf(stderr, "    -v4 : if it is a version 4 file\n");
  return 1;
}

/*
 * Usage: <prog> -v3|-v4 <XPress file>
 *
 * Loads the whole file in memory, flags the blocs that cannot hold text
 * headers, scans the remaining blocs for texts and writes the result as
 * HTML on stdout.
 *
 * Returns 0 on success and 1 on a usage error, an I/O error, a failed
 * allocation, or a file whose size is not a usable multiple of 256 bytes.
 */
int main(int n, char **v)
{
  FILE *fd;
  int len;
  char *p;
  struct stat buf;

  if (n != 3)
    return print_usage();

  if (strcmp(v[1], "-v3") == 0)
    version = 3;
  else if (strcmp(v[1], "-v4") == 0)
    version = 4;
  else
    return print_usage();

  fd = fopen(v[2], "rb");
  if (fd == NULL) {
    perror("fopen");
    return 1;
  }

  if (fstat(fileno(fd), &buf) == -1) {
    perror("fstat");
    fclose(fd);
    return 1;
  }

  /* The rest of the program works with int offsets: refuse a size that
   * does not survive the conversion instead of silently truncating it. */
  len = (int)buf.st_size;
  if (len < 0 || (off_t)len != buf.st_size) {
    fprintf(stderr, "file too large\n");
    fclose(fd);
    return 1;
  }

  if (len & 255) {
    fprintf(stderr, "bad length, not multiple of 256\n");
    fclose(fd);
    return 1;
  }

  /* mark_undata_blocks() touches blocs 0 and 1 and starts its walk at
   * bloc 2, so anything shorter than three blocs cannot be valid and
   * would overrun the flag array. */
  if ((len >> 8) < 3) {
    fprintf(stderr, "bad xpress file\n");
    fclose(fd);
    return 1;
  }

  p = malloc(len);
  if (p == NULL) {
    fprintf(stderr, "out of memory\n");
    fclose(fd);
    return 1;
  }

  /* fread() keeps reading until everything arrived or an error/EOF
   * occurs, unlike the single read() used before, which may legally
   * return fewer bytes than requested. */
  if (fread(p, 1, (size_t)len, fd) != (size_t)len) {
    fprintf(stderr, "error calling read\n");
    fclose(fd);
    free(p);
    return 1;
  }

  fclose(fd);

  the_blocks = calloc(len >> 8, sizeof(char));
  if (!the_blocks) {
    perror("calloc");
    free(p);
    return 1;
  }

  mark_undata_blocks(p, len);

  scan_blocks(p, len);

  dump_texts();

  free(the_blocks);
  free(p);

  /* report a truncated output (disk full, closed pipe, ...) */
  if (fflush(stdout) == EOF) {
    perror("stdout");
    return 1;
  }

  return 0;
}
