/*

epstopdf-fixjpg.c -- Heiko Purnhagen (purnhage@tnt.uni-hannover.de)

HP 20020825 20020826 20020827 20020830 20020902 20020903 20020921

Background:

  "get smaller PostScript/PDF files by using JPEG-compressed images"

  PostScript/EPS (Level 2 or higher) and PDF allow grayscale and
  color images to be included in JPEG format, using the so-called
  DCTDecode filter.  ImageMagick's convert tool can be used to
  convert image.jpg to image.eps:

    convert image.jpg eps2:image.eps

  If you want to avoid JPEG->JPEG transcoding, you could use a tool
  like jpeg2ps, which basically puts a wrapper around the original JPEG
  file:

    jpeg2ps image.jpg > image.eps

  However, jpeg2ps's binary mode (-b) seems to produce EPS files that
  are hardly includable in larger PostScript files.  Furthermore, the
  EPS files generated by jpeg2ps don't work with the following
  epstopdf-fixjpg tool.  Hence, I hacked a tiny AWK script, jpg2eps,
  that "wraps" the original JPEG file, generating an EPS file with the
  same structure as one generated by convert:

    jpg2eps image.jpg > image.eps

  For non-JPEG images, use:

    convert -compress JPEG -quality 75 image.gif eps2:image.eps

  Unfortunately, the ps/eps export doesn't work correctly in quite a
  few versions of ImageMagick, but at least ImageMagick-5.4.4-5.tar.gz
  seems to be OK.

  Conversion of such an image.eps to image.pdf, for example by the
  epstopdf script (which should come with any decent pdflatex
  distribution), is a bit more troublesome.

    epstopdf --outfile=image.pdf image.eps 

  Depending on your version of ghostscript, you can get

  - large files using LZWDecode (gs 6.0)
  - small files using DCTDecode, but transcoded (gs 6.5, gs 7.0)

  While it is nice that the pdfwrite output device of ghostscript now
  produces compressed images by default, it is sometimes desirable to
  avoid transcoding for JPEG images.

  Hence, I wrote epstopdf-fixjpg as a quick hack to replace the images
  in image.pdf as generated by epstopdf by the "original" DCTDecode
  data in image.eps.  This tool can also work for eps files that
  include more than one DCTDecode image, e.g. a figure exported from
  xfig containing several DCTDecode EPS images.

Usage:

    gcc -o epstopdf-fixjpg epstopdf-fixjpg.c

  convert image.eps to image.pdf:

    epstopdf -o=image_tmp.pdf image.eps
    epstopdf-fixjpg image_tmp.pdf image.eps > image.pdf

  replace images in image_tmp.pdf by (concatenated) JPEGs in image.jpg

    epstopdf-fixjpg image_tmp.pdf image.jpg > image.pdf

  just re-generate xref table in PDF file:

    epstopdf-fixjpg image_tmp.pdf > image.pdf

Links:

  http://www.tnt.uni-hannover.de/~purnhage/software/epstopdf-fixjpg.c
  http://www.tnt.uni-hannover.de/~purnhage/software/jpg2eps
  http://www.imagemagick.org/www/convert.html
  http://www.cs.wisc.edu/~ghost/
  http://www-personal.umich.edu/~wsherman/eps_img/
  http://www.pdflib.com/jpeg2ps/index.html
  http://www.tug.org/applications/pdftex/epstopdf
  http://www.xfig.org/
  http://partners.adobe.com/asn/developer/acrosdk/DOCS/pdfspec.pdf

*/



#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXOBJ 1000   /* object numbers must be below this limit */
#define STRLEN 16384  /* capacity of a line buffer (callers allocate STRLEN+1) */
#define KEYLEN 128    /* bytes kept in the buffer when an over-long line is flushed */

/*
  getline -- read one line from i into the buffer s.

  Bytes are read until a '\n' has been stored or EOF is reached.  Every
  byte read increments *ic.  The buffer is NOT NUL-terminated; the
  return value is the number of valid bytes now in s (0 means EOF with
  nothing pending).

  Over-long lines: as soon as STRLEN bytes are buffered without a
  newline, the first STRLEN-KEYLEN bytes are written to o (and added to
  *oc) if o is non-NULL, or silently dropped if o is NULL.  The last
  KEYLEN bytes are moved to the front of s and reading continues, so a
  keyword of up to KEYLEN bytes that straddles the flush point is still
  contiguous in what is returned.  The bytes returned in s have not
  been written to o; that is left to the caller.

  s must provide room for at least STRLEN bytes.  oc is only
  dereferenced when o is non-NULL.

  NOTE(review): POSIX declares a different getline() in <stdio.h>;
  this definition only compiles where that declaration is hidden
  (e.g. strict ISO mode such as -std=c99) -- confirm for your
  toolchain.
*/
int getline(char *s, int *ic, int *oc, FILE *i, FILE *o)
{
  int n=0;
  int t;
  char *ss=s;

  while ((t=getc(i))!=EOF) {
    *ss++ = t;
    n++;
    (*ic)++;
    if (t=='\n')
      return n;
    if (n==STRLEN) {
      /* buffer full: emit the head, keep the last KEYLEN bytes */
      if (o) {
        fwrite(s,1,STRLEN-KEYLEN,o);
        (*oc) += STRLEN-KEYLEN;
      }
      memmove(s,s+STRLEN-KEYLEN,KEYLEN);
      ss -= STRLEN-KEYLEN;
      n -= STRLEN-KEYLEN;
    }
  }
  return n;
}

/*
  Helper for extractjpg(): i holds one or more raw JPEG files
  concatenated back to back.

  Scans for the (numjpg+1)-th 0xff 0xd8 (SOI) byte pair and copies
  everything from there up to -- but not including -- the next SOI
  that directly follows a 0xff 0xd9 (EOI) pair, or up to EOF.  When o
  is non-NULL the bytes are written to o, followed by one '\n', and
  *oc is advanced by the number of bytes written.

  Returns the number of JPEG bytes (the trailing '\n' is not counted),
  or 0 if the requested SOI was not found.
*/
static int extractjpg_raw(int numjpg, FILE *i, int *oc, FILE *o)
{
  int n;
  int t,t1,t2;

  t = getc(i);
  while (t!=EOF) {
    if (t!=0xff) {
      t = getc(i);
      continue;
    }
    /* t was 0xff: the next byte decides; keep it in t so that a run
       like ff ff d8 is still recognised */
    t = getc(i);
    if (t!=0xd8)
      continue;
    if (--numjpg != -1)
      continue;

    /* requested SOI found */
    n = 2;
    if (o) {
      putc(0xff,o);
      putc(0xd8,o);
      *oc += 2;
    }
    /* t1,t2: the two bytes most recently accepted as JPEG data */
    t1 = 0xff;
    t2 = 0xd8;
    t = getc(i);
    while (t!=EOF) {
      n++;
      if (t!=0xff) {
        if (o) {
          putc(t,o);
          (*oc)++;
        }
        t1 = t2;
        t2 = t;
        t = getc(i);
      }
      else {
        t = getc(i);
        if (t==0xd8 && t1==0xff && t2==0xd9) {
          /* EOI immediately followed by SOI: the next JPEG starts
             here; the 0xff just counted belongs to it */
          if (o) {
            putc('\n',o);
            (*oc)++;
          }
          return n-1;
        }
        if (o) {
          putc(0xff,o);
          (*oc)++;
        }
        t1 = t2;
        t2 = 0xff;
        /* t (the byte after 0xff) is handled by the next iteration */
      }
    }
    /* EOF ends the last JPEG */
    if (o) {
      putc('\n',o);
      (*oc)++;
    }
    return n;
  }
  return 0;
}

/*
  Helper for extractjpg(): i is an EPS file in which the JPEG data
  starts at the beginning of a line (0xff 0xd8 right after a '\n') and
  is terminated by a line reading exactly "%%EndData".

  The start marker is matched against the last three bytes read, so an
  occurrence is found no matter what precedes it (the former
  look-ahead with getc() swallowed bytes and missed e.g.
  "\n\n\xff\xd8").

  When o is non-NULL the JPEG bytes, including the newline in front of
  "%%EndData", are written to o and *oc is advanced accordingly.

  Returns the number of JPEG bytes (not counting that final newline),
  or 0 if the requested marker was not found.  Terminates the program
  if the marker is found but no "%%EndData" line follows.
*/
static int extractjpg_eps(char *s, int numjpg, FILE *i, int *oc, FILE *o)
{
  int n;
  int t;
  int p1=EOF;   /* previous byte */
  int p2=EOF;   /* byte before the previous one */

  while ((t=getc(i))!=EOF) {
    if (p2=='\n' && p1==0xff && t==0xd8 && --numjpg == -1) {
      n = 2;
      if (o) {
        putc(0xff,o);
        putc(0xd8,o);
        *oc += 2;
      }
      /* getline() counts every byte it reads into n */
      while ((t=getline(s,&n,oc,i,o))!=0) {
        if (t==10 && !memcmp(s,"%%EndData\n",10))
          return n-10-1;   /* drop "%%EndData\n" and the '\n' before it */
        if (o) {
          fwrite(s,1,t,o);
          *oc += t;
        }
      }
      fprintf(stderr,"epstopdf-fixjpg: eps parse error\n");
      exit (-1);
    }
    p2 = p1;
    p1 = t;
  }
  return 0;
}

/*
  extractjpg -- locate JPEG number numjpg (0-based) in file i and
  optionally copy it to o.

  s         line buffer handed to getline() (EPS input only)
  epsisjpg  non-zero: i is raw, possibly concatenated, JPEG data;
            zero: i is an EPS file wrapping the JPEG data
  numjpg    index of the JPEG to extract
  i         input file; it is rewound first
  oc        output byte counter, advanced by the bytes written to o;
            only used when o is non-NULL
  o         output file, or NULL to just measure the JPEG

  Returns the length of the JPEG data in bytes, or 0 if i contains no
  JPEG with that index.  A '\n' is written to o after the JPEG data;
  it is not included in the returned length.
*/
int extractjpg(char *s, int epsisjpg, int numjpg, FILE *i, int *oc, FILE *o)
{
  rewind(i);
  if (epsisjpg)
    return extractjpg_raw(numjpg,i,oc,o);
  return extractjpg_eps(s,numjpg,i,oc,o);
}

/*
  main -- copy the PDF named by argv[1] to stdout line by line while

  - recording the output offset of every "N 0 obj" line,
  - if argv[2] (EPS or raw JPEG file) is given, replacing the stream
    of every image object whose dictionary contains
    "BitsPerComponent 8" by the next JPEG found in argv[2], stored
    with /Filter/DCTDecode and the matching /Length,
  - replacing the xref table and the startxref offset by values that
    match the rewritten file.

  A summary (and one line per replaced image) is printed to stderr.
  Returns 0 on success; every error terminates with exit(-1).
*/
int main (int argc, char *argv[])
{
  FILE *pdff,*epsf,*outf;
  int pdfc,outc;        /* bytes read from the pdf / written to outf */
  int numjpg;           /* index of the next JPEG to take from epsf */
  int numobj,maxobj;    /* "obj" lines seen / highest object number */

  /* output offset of each object; 0 = object not present.  Zeroed up
     front so that gaps in the numbering are well defined when the
     xref table is built. */
  int objoff[MAXOBJ+1] = {0};
  char pdfs[STRLEN+1];  /* current pdf line */
  char epss[STRLEN+1];  /* held-back "/Length ..." text; also scratch
                           buffer for extractjpg() */
  char outs[STRLEN+1];  /* formatting buffer */

  int epsisjpg = 0;
  int n,obj,i;
  char *s;
  int ti;
  char ts[5];

  if (argc!=2 && argc!=3) {
    fprintf(stderr,
            "epstopdf-fixjpg (HP20020921)\n"
            "usage:    epstopdf-fixjpg image_tmp.pdf image.eps > image.pdf\n"
            "   or:    epstopdf-fixjpg image_tmp.pdf image.jpg > image.pdf\n"
            "fix xref: epstopdf-fixjpg image_tmp.pdf > image.pdf\n");
    exit (-1);
  }
  /* both inputs carry binary data, hence "rb" */
  if ((pdff=fopen(argv[1],"rb"))==NULL) {
    fprintf(stderr,
            "epstopdf-fixjpg: can not open pdf \"%s\"\n",argv[1]);
    exit (-1);
  }
  if (argc==2)
    epsf = NULL;
  else if ((epsf=fopen(argv[2],"rb"))==NULL) {
    fprintf(stderr,
            "epstopdf-fixjpg: can not open eps/jpg \"%s\"\n",argv[2]);
    exit (-1);
  }
  /* a file starting with 0xff 0xd8 is treated as raw JPEG data */
  if (epsf)
    epsisjpg = (getc(epsf)==0xff && getc(epsf)==0xd8);

  outf = stdout;
  pdfc = outc = 0;
  numjpg = 0;
  maxobj = numobj = 0;
  *epss = '\0';

  while ((n=getline(pdfs,&pdfc,&outc,pdff,outf))!=0) {
    pdfs[n] = '\0';
    /* %4s never stores white space, so "obj" is the only spelling
       that can match here */
    if (strstr(pdfs," 0 obj") && sscanf(pdfs,"%d %d %4s",&obj,&ti,ts)==3 &&
        ti==0 && !strcmp(ts,"obj")) {
      if (obj<0 || obj>=MAXOBJ) {
        fprintf(stderr,"epstopdf-fixjpg: obj error\n");
        exit (-1);
      }
      numobj++;
      if (maxobj<obj)
        maxobj = obj;
      /* the line itself is written at the end of this iteration, so
         outc is the offset at which the object will start */
      objoff[obj] = outc;
    }
    else if (epsf && (s=strstr(pdfs,"/Subtype/Image/Length"))) {
      /* write the line up to "/Subtype/Image" and hold back
         "/Length ..." until the following line has been seen */
      s += 14;
      *s = '\0';
      fprintf(outf,"%s",pdfs);
      outc += strlen(pdfs);
      n = 0;
      strcpy (epss+1,s+1);
      *epss = '/';
      /*
        Note: For gs 6.0, forward reference "/Length 8 0 R" is removed
        if "BitsPerComponent 8" is found in next line, but the object
        "8 0 obj" itself is not deleted ...  (For gs 6.5x and 7.0x,
        this check is not needed and has no effect)
      */
    }
    else if (epsf && (s=strstr(pdfs,"BitsPerComponent 8"))) {
      if (*epss) {
        /* the held-back "/Length ..." is dropped; just end its line */
        fprintf(outf,"%c",'\n');
        outc++;
      }
      /* keep the line up to and including "BitsPerComponent 8"; the
         character following it is discarded */
      s += 18;
      *s = '\0';
      fprintf(outf,"%s\n",pdfs);
      outc += strlen(pdfs)+1;
      n -= (int)strlen(pdfs)+1;
      /* s,n: what is left of the line after the discarded character.
         Skip input until a line ending in "stream\n" has been
         consumed; this may already be the rest of the current line. */
      s++;
      while (n>0 && !(n>=7 && !memcmp(s+n-7,"stream\n",7))) {
        n = getline(pdfs,&pdfc,NULL,pdff,NULL);
        s = pdfs;
      }
      /* first pass only measures the JPEG, second pass copies it */
      n = extractjpg(epss,epsisjpg,numjpg,epsf,NULL,NULL);
      fprintf(stderr,"epstopdf-fixjpg: jpg%d: %d\n",numjpg,n);
      if (!n) {
        fprintf(stderr,"epstopdf-fixjpg: no jpg found\n");
        exit (-1);
      }
      sprintf(outs,"/Filter/DCTDecode/Length %d>>stream\n",n);
      fprintf(outf,"%s",outs);
      outc += strlen(outs);
      extractjpg(epss,epsisjpg,numjpg,epsf,&outc,outf);
      /* drop the original stream data up to its "endstream" line */
      do {
        n = getline(pdfs,&pdfc,NULL,pdff,NULL);
        pdfs[n] = '\0';
      }
      while (strcmp(pdfs,"endstream\n")!=0 && n>0);
      fprintf(outf,"%s",pdfs);
      outc += strlen(pdfs);
      n = 0;
      numjpg++;
      *epss = '\0';
    }
    else if (!strcmp(pdfs,"xref\n")) {
      /* chain the unused object numbers (offset 0) into a free list:
         each free entry stores minus the number of the next one */
      ti = 0;
      objoff[0] = 0;
      for (i=maxobj; i>=0; i--)
        if (!objoff[i]) {
          objoff[i] = -ti;
          ti = i;
        }
      sprintf(outs,"xref\n0 %d\n%010d 65535 f \n",maxobj+1,-objoff[0]);
      /* from here on objoff[0] holds the offset of the new xref table */
      objoff[0] = outc;
      fprintf(outf,"%s",outs);
      outc += strlen(outs);
      for (i=1; i<=maxobj; i++)
        if (objoff[i]>0)
          fprintf(outf,"%010d 00000 n \n",objoff[i]);
        else
          fprintf(outf,"%010d 00000 f \n",-objoff[i]);
      /* maxobj entries of 20 bytes each were just written */
      outc += maxobj*20;
      /* skip the old table: the "0 N" line plus maxobj+1 entries */
      for (i=0; i<maxobj+2; i++)
        getline(pdfs,&pdfc,NULL,pdff,NULL);
      n = 0;
    }
    else if (!strcmp(pdfs,"startxref\n")) {
      sprintf(outs,"startxref\n%d\n",objoff[0]);
      fprintf(outf,"%s",outs);
      outc += strlen(outs);
      /* skip the old offset line */
      getline(pdfs,&pdfc,NULL,pdff,NULL);
      n = 0;
    }
    else if (*epss) {
      /* no "BitsPerComponent 8" found after "/Subtype/Image/Length" */
      fprintf(outf,"%s",epss);
      outc += strlen(epss);
      *epss = '\0';
    }
    /* n is non-zero only if the line is to be copied unchanged */
    if (n) {
      fwrite(pdfs,1,n,outf);
      outc += n;
    }
  }

  fprintf(stderr,"epstopdf-fixjpg: pdf: in=%d out=%d jpg=%d obj=%d max=%d\n",
          pdfc,outc,numjpg,numobj,maxobj);
  fclose(pdff);
  if (epsf)
    fclose(epsf);
  /* a truncated PDF must not be reported as success */
  if (fflush(outf)!=0 || ferror(outf)) {
    fprintf(stderr,"epstopdf-fixjpg: write error\n");
    exit (-1);
  }
  return 0;
}


/*

this is just an attic with some internal notes ...


file/magic
0       beshort         0xffd8          JPEG image data
>6      string          JFIF            \b, JFIF standard
>6      string          Exif            \b, EXIF standard


od -N 10 -t x1 -a image.jpg
0000000 ff d8 ff e0 00 10 4a 46 49 46
        del   X del   ` nul dle   J   F   I   F
0000000 ff d8 ff e1 88 45 45 78 69 66
        del   X del   a  bs   E   E   x   i   f


od -t x1 image.jpg | tail -2
0141660 3e ba f2 28 8c ff d9
0141667


eps - convert 5.4.4:
userdict begin
%%BeginData:        24154 BINARY Bytes
DisplayImage
0 0
438 297
12.000000
438 297
1
0
1
0
8
ÿØ***
%%EndData


pdf - gs 6.0:
<</Type/XObject/Name/R7/Subtype/Image/Length 8 0 R
/ColorSpace /DeviceRGB/Width 438/Height 297/BitsPerComponent 8/Filter/LZWDecode
>>
stream
ÿØ***
endstream


pdf - gs 7.0:
/ColorSpace/DeviceGray
/Width 438
/Height 297
/BitsPerComponent 8
/Filter/DCTDecode/Length 21152>>stream
ÿØ***
endstream


grep -ab '^[0-9]\+ [0-9] obj' a75gc750.pdf
15:5 0 obj
225:6 0 obj
244:4 0 obj
415:3 0 obj
474:1 0 obj
522:7 0 obj
591:8 0 obj
21891:9 0 obj
45408:10 0 obj
45438:11 0 obj
45477:2 0 obj


grep -ab '^xref$' a75gc750.pdf
45638:xref


tail -25 a75gc750.pdf
2 0 obj
<</Producer(AFPL Ghostscript 7.04)
/Title(a75gc750.eps)
/Creator(fig2dev Version 3.2 Patchlevel 3d)
/Author(purnhage@abakus \(Heiko Purnhagen\))>>endobj
xref
0 12
0000000000 65535 f
0000000474 00000 n
0000045477 00000 n
0000000415 00000 n
0000000244 00000 n
0000000015 00000 n
0000000225 00000 n
0000000522 00000 n
0000000591 00000 n
0000021891 00000 n
0000045408 00000 n
0000045438 00000 n
trailer
<< /Size 12 /Root 1 0 R /Info 2 0 R
>>
startxref
45638
%%EOF


*/

