twain3.0/3rdparty/hgOCR/leptonica/encoding.c

650 lines
22 KiB
C

/*====================================================================*
- Copyright (C) 2001 Leptonica. All rights reserved.
- This software is distributed in the hope that it will be
- useful, but with NO WARRANTY OF ANY KIND.
- No author or distributor accepts responsibility to anyone for the
- consequences of using this software, or for whether it serves any
- particular purpose or works at all, unless he or she says so in
- writing. Everyone is granted permission to copy, modify and
- redistribute this source code, for commercial or non-commercial
- purposes, with the following restrictions: (1) the origin of this
- source code must not be misrepresented; (2) modified versions must
- be plainly marked as such; and (3) this notice may not be removed
- or altered from any source or modified source distribution.
*====================================================================*/
/*
* encodings.c
*
* Base64
* char *encodeBase64()
* l_uint8 *decodeBase64()
* static l_int32 isBase64()
* static l_int32 *genReverseTab64()
* static void byteConvert3to4()
* static void byteConvert4to3()
*
* Ascii85
* char *encodeAscii85()
* l_uint8 *decodeAscii85()
* static l_int32 convertChunkToAscii85()
*
* String reformatting for base 64 encoded data
* char *reformatPacked64()
*
* Base64 encoding is useful for encding binary data in a restricted set of
* 64 printable ascii symbols, that includes the 62 alphanumerics and '+'
* and '/'. Notably it does not include quotes, so that base64 encoded
* strings can be used in situations where quotes are used for formatting.
* 64 symbols was chosen because it is the smallest number that can be used
* in 4-for-3 byte encoding of binary data:
* log2(64) / log2(256) = 0.75 = 3/4
*
* Ascii85 encoding is used in PostScript and some pdf files for
* representing binary data (for example, a compressed image) in printable
* ascii symbols. It has a dictionary of 85 symbols; 85 was chosen because
* it is the smallest number that can be used in 5-for-4 byte encoding
* of binary data (256 possible input values). This can be seen from
* the max information content in such a sequence:
* log2(84) / log2(256) = 0.799 < 4/5
* log2(85) / log2(256) = 0.801 > 4/5
*/
#include <ctype.h>
#include "allheaders.h"
/* Base64 encoding table in string representation */
static const l_int32 MAX_BASE64_LINE = 72; /* max line length base64 */
static const char *tablechar64 =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static l_int32 isBase64(char);
static l_int32 *genReverseTab64(void);
static void byteConvert3to4(l_uint8 *in3, l_uint8 *out4);
static void byteConvert4to3(l_uint8 *in4, l_uint8 *out3);
/* Ascii85 encoding */
static const l_int32 MAX_ASCII85_LINE = 64; /* max line length ascii85 */
static const l_uint32 power85[5] = {1,
85,
85 * 85,
85 * 85 * 85,
85 * 85 * 85 * 85};
static l_int32 convertChunkToAscii85(const l_uint8 *inarray, l_int32 insize,
l_int32 *pindex, char *outbuf,
l_int32 *pnbout);
/*-------------------------------------------------------------*
* Utility for encoding and decoding data with base64 *
*-------------------------------------------------------------*/
/*!
* \brief encodeBase64()
*
* \param[in] inarray input binary data
* \param[in] insize number of bytes in input array
* \param[out] poutsize number of bytes in output char array
* \return chara with MAX_BASE64_LINE characters + \n in each line
*
* <pre>
* Notes:
* (1) The input character data is unrestricted binary.
* The output encoded data consists of the 64 characters
* in the base64 set, plus newlines and the pad character '='.
* </pre>
*/
char *
encodeBase64(const l_uint8 *inarray,
l_int32 insize,
l_int32 *poutsize)
{
char *chara;
const l_uint8 *bytea;
l_uint8 array3[3], array4[4];
l_int32 outsize, i, j, index, linecount;
PROCNAME("encodeBase64");
if (!poutsize)
return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
*poutsize = 0;
if (!inarray)
return (char *)ERROR_PTR("inarray not defined", procName, NULL);
if (insize <= 0)
return (char *)ERROR_PTR("insize not > 0", procName, NULL);
/* The output array is padded to a multiple of 4 bytes, not
* counting the newlines. We just need to allocate a large
* enough array, and add 4 bytes to make sure it is big enough. */
outsize = 4 * ((insize + 2) / 3); /* without newlines */
outsize += outsize / MAX_BASE64_LINE + 4; /* with the newlines */
if ((chara = (char *)LEPT_CALLOC(outsize, sizeof(char))) == NULL)
return (char *)ERROR_PTR("chara not made", procName, NULL);
/* Read all the input data, and convert in sets of 3 input
* bytes --> 4 output bytes. */
i = index = linecount = 0;
bytea = inarray;
while (insize--) {
if (linecount == MAX_BASE64_LINE) {
chara[index++] = '\n';
linecount = 0;
}
array3[i++] = *bytea++;
if (i == 3) { /* convert 3 to 4 and save */
byteConvert3to4(array3, array4);
for (j = 0; j < 4; j++)
chara[index++] = tablechar64[array4[j]];
i = 0;
linecount += 4;
}
}
/* Suppose 1 or 2 bytes has been read but not yet processed.
* If 1 byte has been read, this will generate 2 bytes of
* output, with 6 bits to the first byte and 2 bits to the second.
* We will add two bytes of '=' for padding.
* If 2 bytes has been read, this will generate 3 bytes of output,
* with 6 bits to the first 2 bytes and 4 bits to the third, and
* we add a fourth padding byte ('='). */
if (i > 0) { /* left-over 1 or 2 input bytes */
for (j = i; j < 3; j++)
array3[j] = '\0'; /* zero the remaining input bytes */
byteConvert3to4(array3, array4);
for (j = 0; j <= i; j++)
chara[index++] = tablechar64[array4[j]];
for (j = i + 1; j < 4; j++)
chara[index++] = '=';
}
*poutsize = index;
return chara;
}
/*!
* \brief decodeBase64()
*
* \param[in] inarray input encoded char data, with 72 chars/line)
* \param[in] insize number of bytes in input array
* \param[out] poutsize number of bytes in output byte array
* \return bytea decoded byte data, or NULL on error
*
* <pre>
* Notes:
* (1) The input character data should have only 66 different characters:
* The 64 character set for base64 encoding, plus the pad
* character '=' and newlines for formatting with fixed line
* lengths. If there are any other characters, the decoder
* will declare the input data to be invalid and return NULL.
* (2) The decoder ignores newlines and, for a valid input string,
* stops reading input when a pad byte is found.
* </pre>
*/
l_uint8 *
decodeBase64(const char *inarray,
l_int32 insize,
l_int32 *poutsize)
{
char inchar;
l_uint8 *bytea;
l_uint8 array3[3], array4[4];
l_int32 *rtable64;
l_int32 i, j, outsize, in_index, out_index;
PROCNAME("decodeBase64");
if (!poutsize)
return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
*poutsize = 0;
if (!inarray)
return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
if (insize <= 0)
return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);
/* Validate the input data */
for (i = 0; i < insize; i++) {
inchar = inarray[i];
if (inchar == '\n') continue;
if (isBase64(inchar) == 0 && inchar != '=')
return (l_uint8 *)ERROR_PTR("invalid char in inarray",
procName, NULL);
}
/* The input array typically is made with a newline every
* MAX_BASE64_LINE input bytes. However, as a printed string, the
* newlines would be stripped. So when we allocate the output
* array, assume the input array is all data, but strip
* out the newlines during decoding. This guarantees that
* the allocated array is large enough. */
outsize = 3 * ((insize + 3) / 4) + 4;
if ((bytea = (l_uint8 *)LEPT_CALLOC(outsize, sizeof(l_uint8))) == NULL)
return (l_uint8 *)ERROR_PTR("bytea not made", procName, NULL);
/* The number of encoded input data bytes is always a multiple of 4.
* Read all the data, until you reach either the end or
* the first pad character '='. The data is processed in
* units of 4 input bytes, generating 3 output decoded bytes
* of binary data. Newlines are ignored. If there are no
* pad bytes, i == 0 at the end of this section. */
rtable64 = genReverseTab64();
i = in_index = out_index = 0;
for (in_index = 0; in_index < insize; in_index++) {
inchar = inarray[in_index];
if (inchar == '\n') continue;
if (inchar == '=') break;
array4[i++] = rtable64[(unsigned char)inchar];
if (i < 4) {
continue;
} else { /* i == 4; convert 4 to 3 and save */
byteConvert4to3(array4, array3);
for (j = 0; j < 3; j++)
bytea[out_index++] = array3[j];
i = 0;
}
}
/* If i > 0, we ran into pad bytes ('='). If i == 2, there are
* two input pad bytes and one output data byte. If i == 3,
* there is one input pad byte and two output data bytes. */
if (i > 0) {
for (j = i; j < 4; j++)
array4[j] = '\0'; /* zero the remaining input bytes */
byteConvert4to3(array4, array3);
for (j = 0; j < i - 1; j++)
bytea[out_index++] = array3[j];
}
*poutsize = out_index;
LEPT_FREE(rtable64);
return bytea;
}
/*!
* \brief isBase64()
*/
static l_int32
isBase64(char c)
{
return (isalnum(((int)c)) || ((c) == '+') || ((c) == '/')) ? 1 : 0;
}
/*!
* \brief genReverseTab64()
*/
static l_int32 *
genReverseTab64()
{
l_int32 i;
l_int32 *rtable64;
rtable64 = (l_int32 *)LEPT_CALLOC(128, sizeof(l_int32));
for (i = 0; i < 64; i++) {
rtable64[(unsigned char)tablechar64[i]] = i;
}
return rtable64;
}
/*!
* \brief byteConvert3to4()
*/
static void
byteConvert3to4(l_uint8 *in3,
l_uint8 *out4)
{
out4[0] = in3[0] >> 2;
out4[1] = ((in3[0] & 0x03) << 4) | (in3[1] >> 4);
out4[2] = ((in3[1] & 0x0f) << 2) | (in3[2] >> 6);
out4[3] = in3[2] & 0x3f;
return;
}
/*!
* \brief byteConvert4to3()
*/
static void
byteConvert4to3(l_uint8 *in4,
l_uint8 *out3)
{
out3[0] = (in4[0] << 2) | (in4[1] >> 4);
out3[1] = ((in4[1] & 0x0f) << 4) | (in4[2] >> 2);
out3[2] = ((in4[2] & 0x03) << 6) | in4[3];
return;
}
/*-------------------------------------------------------------*
* Utility for encoding and decoding data with ascii85 *
*-------------------------------------------------------------*/
/*!
* \brief encodeAscii85()
*
* \param[in] inarray input data
* \param[in] insize number of bytes in input array
* \param[out] poutsize number of bytes in output char array
* \return chara with 64 characters + \n in each line
*
* <pre>
* Notes:
* (1) Ghostscript has a stack break if the last line of
* data only has a '>', so we avoid the problem by
* always putting '~>' on the last line.
* </pre>
*/
char *
encodeAscii85(const l_uint8 *inarray,
l_int32 insize,
l_int32 *poutsize)
{
char *chara;
char outbuf[8];
l_int32 maxsize, i, index, outindex, linecount, nbout, eof;
PROCNAME("encodeAscii85");
if (!poutsize)
return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
*poutsize = 0;
if (!inarray)
return (char *)ERROR_PTR("inarray not defined", procName, NULL);
if (insize <= 0)
return (char *)ERROR_PTR("insize not > 0", procName, NULL);
/* Accumulate results in char array */
maxsize = (l_int32)(80. + (insize * 5. / 4.) *
(1. + 2. / MAX_ASCII85_LINE));
if ((chara = (char *)LEPT_CALLOC(maxsize, sizeof(char))) == NULL)
return (char *)ERROR_PTR("chara not made", procName, NULL);
linecount = 0;
index = 0;
outindex = 0;
while (1) {
eof = convertChunkToAscii85(inarray, insize, &index, outbuf, &nbout);
for (i = 0; i < nbout; i++) {
chara[outindex++] = outbuf[i];
linecount++;
if (linecount >= MAX_ASCII85_LINE) {
chara[outindex++] = '\n';
linecount = 0;
}
}
if (eof == TRUE) {
if (linecount != 0)
chara[outindex++] = '\n';
chara[outindex++] = '~';
chara[outindex++] = '>';
chara[outindex++] = '\n';
break;
}
}
*poutsize = outindex;
return chara;
}
/*!
* \brief convertChunkToAscii85()
*
* \param[in] inarray input data
* \param[in] insize number of bytes in input array
* \param[out] pindex use and -- ptr
* \param[in] outbuf holds 8 ascii chars; we use no more than 7
* \param[out] pnbsout number of bytes written to outbuf
* \return boolean for eof 0 if more data, 1 if end of file
*
* <pre>
* Notes:
* (1) Attempts to read 4 bytes and write 5.
* (2) Writes 1 byte if the value is 0.
* </pre>
*/
static l_int32
convertChunkToAscii85(const l_uint8 *inarray,
l_int32 insize,
l_int32 *pindex,
char *outbuf,
l_int32 *pnbout)
{
l_uint8 inbyte;
l_uint32 inword, val;
l_int32 eof, index, nread, nbout, i;
eof = FALSE;
index = *pindex;
nread = L_MIN(4, (insize - index));
if (insize == index + nread)
eof = TRUE;
*pindex += nread; /* save new index */
/* Read input data and save in l_uint32 */
inword = 0;
for (i = 0; i < nread; i++) {
inbyte = inarray[index + i];
inword += inbyte << (8 * (3 - i));
}
#if 0
fprintf(stderr, "index = %d, nread = %d\n", index, nread);
fprintf(stderr, "inword = %x\n", inword);
fprintf(stderr, "eof = %d\n", eof);
#endif
/* Special case: output 1 byte only */
if (inword == 0) {
outbuf[0] = 'z';
nbout = 1;
} else { /* output nread + 1 bytes */
for (i = 4; i >= 4 - nread; i--) {
val = inword / power85[i];
outbuf[4 - i] = (l_uint8)(val + '!');
inword -= val * power85[i];
}
nbout = nread + 1;
}
*pnbout = nbout;
return eof;
}
/*!
* \brief decodeAscii85()
*
* \param[in] inarray ascii85 input data
* \param[in] insize number of bytes in input array
* \param[out] poutsize number of bytes in output l_uint8 array
* \return outarray binary
*
* <pre>
* Notes:
* (1) We assume the data is properly encoded, so we do not check
* for invalid characters or the final '>' character.
* (2) We permit whitespace to be added to the encoding in an
* arbitrary way.
* </pre>
*/
l_uint8 *
decodeAscii85(const char *inarray,
l_int32 insize,
l_int32 *poutsize)
{
char inc;
const char *pin;
l_uint8 val;
l_uint8 *outa;
l_int32 maxsize, ocount, bytecount, index;
l_uint32 oword;
PROCNAME("decodeAscii85");
if (!poutsize)
return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
*poutsize = 0;
if (!inarray)
return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
if (insize <= 0)
return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);
/* Accumulate results in outa */
maxsize = (l_int32)(80. + (insize * 4. / 5.)); /* plenty big */
if ((outa = (l_uint8 *)LEPT_CALLOC(maxsize, sizeof(l_uint8))) == NULL)
return (l_uint8 *)ERROR_PTR("outa not made", procName, NULL);
pin = inarray;
ocount = 0; /* byte index into outa */
oword = 0;
for (index = 0, bytecount = 0; index < insize; index++, pin++) {
inc = *pin;
if (inc == ' ' || inc == '\t' || inc == '\n' ||
inc == '\f' || inc == '\r' || inc == '\v') /* ignore white space */
continue;
val = inc - '!';
if (val < 85) {
oword = oword * 85 + val;
if (bytecount < 4) {
bytecount++;
} else { /* we have all 5 input chars for the oword */
outa[ocount] = (oword >> 24) & 0xff;
outa[ocount + 1] = (oword >> 16) & 0xff;
outa[ocount + 2] = (oword >> 8) & 0xff;
outa[ocount + 3] = oword & 0xff;
ocount += 4;
bytecount = 0;
oword = 0;
}
} else if (inc == 'z' && bytecount == 0) {
outa[ocount] = 0;
outa[ocount + 1] = 0;
outa[ocount + 2] = 0;
outa[ocount + 3] = 0;
ocount += 4;
} else if (inc == '~') { /* end of data */
L_INFO(" %d extra bytes output\n", procName, bytecount - 1);
switch (bytecount) {
case 0: /* normal eof */
case 1: /* error */
break;
case 2: /* 1 extra byte */
oword = oword * power85[3] + 0xffffff;
outa[ocount] = (oword >> 24) & 0xff;
break;
case 3: /* 2 extra bytes */
oword = oword * power85[2] + 0xffff;
outa[ocount] = (oword >> 24) & 0xff;
outa[ocount + 1] = (oword >> 16) & 0xff;
break;
case 4: /* 3 extra bytes */
oword = oword * 85 + 0xff;
outa[ocount] = (oword >> 24) & 0xff;
outa[ocount + 1] = (oword >> 16) & 0xff;
outa[ocount + 2] = (oword >> 8) & 0xff;
break;
}
if (bytecount > 1)
ocount += (bytecount - 1);
break;
}
}
*poutsize = ocount;
return outa;
}
/*-------------------------------------------------------------*
* String reformatting for base 64 encoded data *
*-------------------------------------------------------------*/
/*!
* \brief reformatPacked64()
*
* \param[in] inarray base64 encoded string with newlines
* \param[in] insize number of bytes in input array
* \param[in] leadspace number of spaces in each line before the data
* \param[in] linechars number of bytes of data in each line; multiple of 4
* \param[in] addquotes 1 to add quotes to each line of data; 0 to skip
* \param[out] poutsize number of bytes in output char array
* \return outarray ascii
*
* <pre>
* Notes:
* (1) Each line in the output array has %leadspace space characters,
* followed optionally by a double-quote, followed by %linechars
* bytes of base64 data, followed optionally by a double-quote,
* followed by a newline.
* (2) This can be used to convert a base64 encoded string to a
* string formatted for inclusion in a C source file.
* </pre>
*/
char *
reformatPacked64(const char *inarray,
l_int32 insize,
l_int32 leadspace,
l_int32 linechars,
l_int32 addquotes,
l_int32 *poutsize)
{
char *flata, *outa;
l_int32 i, j, flatindex, flatsize, outindex, nlines, linewithpad, linecount;
PROCNAME("reformatPacked64");
if (!poutsize)
return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
*poutsize = 0;
if (!inarray)
return (char *)ERROR_PTR("inarray not defined", procName, NULL);
if (insize <= 0)
return (char *)ERROR_PTR("insize not > 0", procName, NULL);
if (leadspace < 0)
return (char *)ERROR_PTR("leadspace must be >= 0", procName, NULL);
if (linechars % 4)
return (char *)ERROR_PTR("linechars % 4 must be 0", procName, NULL);
/* Remove all white space */
if ((flata = (char *)LEPT_CALLOC(insize, sizeof(char))) == NULL)
return (char *)ERROR_PTR("flata not made", procName, NULL);
for (i = 0, flatindex = 0; i < insize; i++) {
if (isBase64(inarray[i]) || inarray[i] == '=')
flata[flatindex++] = inarray[i];
}
/* Generate output string */
flatsize = flatindex;
nlines = (flatsize + linechars - 1) / linechars;
linewithpad = leadspace + linechars + 1; /* including newline */
if (addquotes) linewithpad += 2;
if ((outa = (char *)LEPT_CALLOC((size_t)nlines * linewithpad,
sizeof(char))) == NULL) {
LEPT_FREE(flata);
return (char *)ERROR_PTR("outa not made", procName, NULL);
}
for (j = 0, outindex = 0; j < leadspace; j++)
outa[outindex++] = ' ';
if (addquotes) outa[outindex++] = '"';
for (i = 0, linecount = 0; i < flatsize; i++) {
if (linecount == linechars) {
if (addquotes) outa[outindex++] = '"';
outa[outindex++] = '\n';
for (j = 0; j < leadspace; j++)
outa[outindex++] = ' ';
if (addquotes) outa[outindex++] = '"';
linecount = 0;
}
outa[outindex++] = flata[i];
linecount++;
}
if (addquotes) outa[outindex++] = '"';
*poutsize = outindex;
LEPT_FREE(flata);
return outa;
}