sdcv/src/dictziplib.cpp

/* dictziplib.c --
 * http://stardict.sourceforge.net
 * Copyright (C) 2003-2003 Hu Zheng <huzheng_001@163.com>
 * This file is a modified version of dictd-1.9.7's data.c
 *
 * data.c --
 * Created: Tue Jul 16 12:45:41 1996 by faith@dict.org
 * Revised: Sat Mar 30 10:46:06 2002 by faith@dict.org
 * Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org)
 *
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define HAVE_MMAP // it will be defined in config.h; this can be done by configure.in with AC_FUNC_MMAP.
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>

#include <sys/stat.h>

#include "dictziplib.hpp"

/* When non-zero, read() first looks for an already inflated chunk in the
 * chunk cache before decompressing it again. */
#define USE_CACHE 1

/* Size in bytes of the scratch buffer used by read_header(). */
#define BUFFERSIZE 10240

/*
 * Output buffer must be greater than or
 * equal to 110% of input buffer size, plus
 * 12 bytes.
 *
 * Note: the names follow the compressor's point of view.  In read(),
 * OUT_BUFFER_SIZE is the size of the buffer that holds one *compressed*
 * chunk, and IN_BUFFER_SIZE is the size of each cache buffer that receives
 * the *inflated* data of one chunk.
*/
#define OUT_BUFFER_SIZE 0xffffL

#define IN_BUFFER_SIZE ((unsigned long)((double)(OUT_BUFFER_SIZE - 12) * 0.89))

/* For gzip-compatible header, as defined in RFC 1952 */

/* Magic for GZIP (rfc1952)                */
#define GZ_MAGIC1 0x1f /* First magic byte                        */
#define GZ_MAGIC2 0x8b /* Second magic byte                       */

/* FLaGs (bitmapped), from rfc1952         */
#define GZ_FTEXT 0x01 /* Set for ASCII text                      */
#define GZ_FHCRC 0x02 /* Header CRC16                            */
#define GZ_FEXTRA 0x04 /* Optional field (random access index)    */
#define GZ_FNAME 0x08 /* Original name                           */
#define GZ_COMMENT 0x10 /* Zero-terminated, human-readable comment */
#define GZ_MAX 2 /* Maximum compression                     */
#define GZ_FAST 4 /* Fastest compression                     */

/* These are from rfc1952                  */
#define GZ_OS_FAT 0 /* FAT filesystem (MS-DOS, OS/2, NT/Win32) */
#define GZ_OS_AMIGA 1 /* Amiga                                   */
#define GZ_OS_VMS 2 /* VMS (or OpenVMS)                        */
#define GZ_OS_UNIX 3 /* Unix                                    */
#define GZ_OS_VMCMS 4 /* VM/CMS                                  */
#define GZ_OS_ATARI 5 /* Atari TOS                               */
#define GZ_OS_HPFS 6 /* HPFS filesystem (OS/2, NT)              */
#define GZ_OS_MAC 7 /* Macintosh                               */
#define GZ_OS_Z 8 /* Z-System                                */
#define GZ_OS_CPM 9 /* CP/M                                    */
#define GZ_OS_TOPS20 10 /* TOPS-20                                 */
#define GZ_OS_NTFS 11 /* NTFS filesystem (NT)                    */
#define GZ_OS_QDOS 12 /* QDOS                                    */
#define GZ_OS_ACORN 13 /* Acorn RISCOS                            */
#define GZ_OS_UNKNOWN 255 /* unknown                                 */

/* Subfield identifier ("RA") that marks the dictzip random access table
 * inside the gzip extra field; read_header() checks for it. */
#define GZ_RND_S1 'R' /* First magic for random access format    */
#define GZ_RND_S2 'A' /* Second magic for random access format   */

/* Byte positions of the header fields, counted from the start of the file. */
#define GZ_ID1 0 /* GZ_MAGIC1                               */
#define GZ_ID2 1 /* GZ_MAGIC2                               */
#define GZ_CM 2 /* Compression Method (Z_DEFLATED)         */
#define GZ_FLG 3 /* FLaGs (see above)                       */
#define GZ_MTIME 4 /* Modification TIME                       */
#define GZ_XFL 8 /* eXtra FLags (GZ_MAX or GZ_FAST)         */
#define GZ_OS 9 /* Operating System                        */
#define GZ_XLEN 10 /* eXtra LENgth (16bit)                    */
#define GZ_FEXTRA_START 12 /* Start of extra fields                   */
#define GZ_SI1 12 /* Subfield ID1                            */
#define GZ_SI2 13 /* Subfield ID2                            */
#define GZ_SUBLEN 14 /* Subfield length (16bit)                 */
#define GZ_VERSION 16 /* Version for subfield format             */
#define GZ_CHUNKLEN 18 /* Chunk length (16bit)                    */
#define GZ_CHUNKCNT 20 /* Number of chunks (16bit)                */
#define GZ_RNDDATA 22 /* Random access data (16bit)              */

/* Values of DictData::type, assigned by read_header(). */
#define DICT_UNKNOWN 0 /* Set before the file has been inspected            */
#define DICT_TEXT 1 /* No gzip magic: read() copies straight from the map */
#define DICT_GZIP 2 /* gzip without the RA table: read() produces nothing */
#define DICT_DZIP 3 /* gzip with the RA table: read() inflates by chunk   */

int DictData::read_header(const std::string &fname, int computeCRC)
{
    FILE *str;
    int id1, id2, si1, si2;
    char buffer[BUFFERSIZE];
    int extraLength, subLength;
    int i;
    char *pt;
    int c;
    struct stat sb;
    unsigned long crc = crc32(0L, Z_NULL, 0);
    int count;
    unsigned long offset;

    if (!(str = fopen(fname.c_str(), "rb"))) {
        //err_fatal_errno( __FUNCTION__,
        //       "Cannot open data file \"%s\" for read\n", filename );
        return -1;
    }

    this->headerLength = GZ_XLEN - 1;
    this->type = DICT_UNKNOWN;

    id1 = getc(str);
    id2 = getc(str);

    if (id1 != GZ_MAGIC1 || id2 != GZ_MAGIC2) {
        this->type = DICT_TEXT;
        fstat(fileno(str), &sb);
        this->compressedLength = this->length = sb.st_size;
        this->origFilename = fname;
        this->mtime = sb.st_mtime;
        if (computeCRC) {
            rewind(str);
            while (!feof(str)) {
                if ((count = fread(buffer, 1, BUFFERSIZE, str))) {
                    crc = crc32(crc, (Bytef *)buffer, count);
                }
            }
        }
        this->crc = crc;
        fclose(str);
        return 0;
    }
    this->type = DICT_GZIP;

    this->method = getc(str);
    this->flags = getc(str);
    this->mtime = getc(str) << 0;
    this->mtime |= getc(str) << 8;
    this->mtime |= getc(str) << 16;
    this->mtime |= getc(str) << 24;
    this->extraFlags = getc(str);
    this->os = getc(str);

    if (this->flags & GZ_FEXTRA) {
        extraLength = getc(str) << 0;
        extraLength |= getc(str) << 8;
        this->headerLength += extraLength + 2;
        si1 = getc(str);
        si2 = getc(str);

        if (si1 == GZ_RND_S1 || si2 == GZ_RND_S2) {
            subLength = getc(str) << 0;
            subLength |= getc(str) << 8;
            this->version = getc(str) << 0;
            this->version |= getc(str) << 8;

            if (this->version != 1) {
                //err_internal( __FUNCTION__,
                //	  "dzip header version %d not supported\n",
                //	  this->version );
            }

            this->chunkLength = getc(str) << 0;
            this->chunkLength |= getc(str) << 8;
            this->chunkCount = getc(str) << 0;
            this->chunkCount |= getc(str) << 8;

            if (this->chunkCount <= 0) {
                fclose(str);
                return 5;
            }
            this->chunks = (int *)malloc(sizeof(this->chunks[0])
                                         * this->chunkCount);
            for (i = 0; i < this->chunkCount; i++) {
                this->chunks[i] = getc(str) << 0;
                this->chunks[i] |= getc(str) << 8;
            }
            this->type = DICT_DZIP;
        } else {
            fseek(str, this->headerLength, SEEK_SET);
        }
    }

    if (this->flags & GZ_FNAME) { /* FIXME! Add checking against header len */
        pt = buffer;
        while ((c = getc(str)) && c != EOF)
            *pt++ = c;
        *pt = '\0';

        this->origFilename = buffer;
        this->headerLength += this->origFilename.length() + 1;
    } else {
        this->origFilename = "";
    }

    if (this->flags & GZ_COMMENT) { /* FIXME! Add checking for header len */
        pt = buffer;
        while ((c = getc(str)) && c != EOF)
            *pt++ = c;
        *pt = '\0';
        comment = buffer;
        headerLength += comment.length() + 1;
    } else {
        comment = "";
    }

    if (this->flags & GZ_FHCRC) {
        getc(str);
        getc(str);
        this->headerLength += 2;
    }

    if (ftell(str) != this->headerLength + 1) {
        //err_internal( __FUNCTION__,
        //    "File position (%lu) != header length + 1 (%d)\n",
        //  ftell( str ), this->headerLength + 1 );
    }

    fseek(str, -8, SEEK_END);
    this->crc = getc(str) << 0;
    this->crc |= getc(str) << 8;
    this->crc |= getc(str) << 16;
    this->crc |= getc(str) << 24;
    this->length = getc(str) << 0;
    this->length |= getc(str) << 8;
    this->length |= getc(str) << 16;
    this->length |= getc(str) << 24;
    this->compressedLength = ftell(str);

    /* Compute offsets */
    this->offsets = (unsigned long *)malloc(sizeof(this->offsets[0])
                                            * this->chunkCount);
    for (offset = this->headerLength + 1, i = 0;
         i < this->chunkCount;
         i++) {
        this->offsets[i] = offset;
        offset += this->chunks[i];
    }

    fclose(str);
    return 0;
}

/*
 * Open the dictionary data file fname for reading.
 *
 * The file must be a regular file whose header read_header() accepts.  The
 * whole file is then mapped through mapfile, start/end are set to the
 * bounds of the mapping, and the chunk cache is reset to "empty".
 *
 * computeCRC is passed on to read_header().
 *
 * Returns true on success, false on any failure.
 */
bool DictData::open(const std::string &fname, int computeCRC)
{
    struct stat sb;

    this->initialized = 0;

    if (stat(fname.c_str(), &sb) || !S_ISREG(sb.st_mode)) {
        //err_warning( __FUNCTION__,
        //   "%s is not a regular file -- ignoring\n", fname );
        return false;
    }

    if (read_header(fname, computeCRC)) {
        //err_fatal( __FUNCTION__,
        // "\"%s\" not in text or dzip format\n", fname );
        return false;
    }

    const int fd = ::open(fname.c_str(), O_RDONLY);
    if (fd < 0) {
        //err_fatal_errno( __FUNCTION__,
        //       "Cannot open data file \"%s\"\n", fname );
        return false;
    }
    /* The descriptor is only needed for the size query, so release it
     * before looking at the result: no exit path may leak it. */
    const int statFailed = fstat(fd, &sb);
    ::close(fd);
    if (statFailed) {
        //err_fatal_errno( __FUNCTION__,
        //       "Cannot stat data file \"%s\"\n", fname );
        return false;
    }

    this->size = sb.st_size;
    if (!mapfile.open(fname.c_str(), size))
        return false;

    this->start = mapfile.begin();
    this->end = this->start + this->size;

    /* Mark every cache slot as unused. */
    for (size_t j = 0; j < DICT_CACHE_SIZE; j++) {
        cache[j].chunk = -1;
        cache[j].stamp = -1;
        cache[j].inBuffer = nullptr;
        cache[j].count = 0;
    }

    return true;
}

/*
 * Release everything read_header() and read() allocated: the chunk tables,
 * the inflation engine and the cache buffers.
 *
 * Each released resource is reset to its "empty" value, so calling close()
 * again is harmless instead of freeing the same memory twice.
 */
void DictData::close()
{
    /* free() accepts a null pointer, so no guards are needed. */
    free(this->chunks);
    this->chunks = nullptr;
    free(this->offsets);
    this->offsets = nullptr;

    if (this->initialized) {
        /* A failure to shut the engine down cannot be reported from here. */
        (void)inflateEnd(&this->zStream);
        //err_internal( __FUNCTION__,
        //       "Cannot shut down inflation engine: %s\n",
        //     this->zStream.msg );
        this->initialized = 0;
    }

    for (size_t i = 0; i < DICT_CACHE_SIZE; ++i) {
        free(this->cache[i].inBuffer);
        this->cache[i].inBuffer = nullptr;
        /* Without a buffer the slot must not match any chunk number. */
        this->cache[i].chunk = -1;
        this->cache[i].count = 0;
    }
}

void DictData::read(char *buffer, unsigned long start, unsigned long size)
{
    char *pt;
    unsigned long end;
    int count;
    char *inBuffer;
    char outBuffer[OUT_BUFFER_SIZE];
    int firstChunk, lastChunk;
    int firstOffset, lastOffset;
    int i;
    int found, target, lastStamp;
    static int stamp = 0;

    end = start + size;

    //buffer = malloc( size + 1 );

    //PRINTF(DBG_UNZIP,
    // ("dict_data_read( %p, %lu, %lu )\n",
    //h, start, size ));

    switch (this->type) {
    case DICT_GZIP:
        //err_fatal( __FUNCTION__,
        // "Cannot seek on pure gzip format files.\n"
        // "Use plain text (for performance)"
        // " or dzip format (for space savings).\n" );
        break;
    case DICT_TEXT:
        memcpy(buffer, this->start + start, size);
        //buffer[size] = '\0';
        break;
    case DICT_DZIP:
        if (!this->initialized) {
            ++this->initialized;
            this->zStream.zalloc = nullptr;
            this->zStream.zfree = nullptr;
            this->zStream.opaque = nullptr;
            this->zStream.next_in = 0;
            this->zStream.avail_in = 0;
            this->zStream.next_out = nullptr;
            this->zStream.avail_out = 0;
            if (inflateInit2(&this->zStream, -15) != Z_OK) {
                //err_internal( __FUNCTION__,
                //  "Cannot initialize inflation engine: %s\n",
                //this->zStream.msg );
            }
        }
        firstChunk = start / this->chunkLength;
        firstOffset = start - firstChunk * this->chunkLength;
        lastChunk = end / this->chunkLength;
        lastOffset = end - lastChunk * this->chunkLength;
        //PRINTF(DBG_UNZIP,
        // ("   start = %lu, end = %lu\n"
        //"firstChunk = %d, firstOffset = %d,"
        //" lastChunk = %d, lastOffset = %d\n",
        //start, end, firstChunk, firstOffset, lastChunk, lastOffset ));
        for (pt = buffer, i = firstChunk; i <= lastChunk; i++) {

            /* Access cache */
            found = 0;
            target = 0;
            lastStamp = INT_MAX;
            for (size_t j = 0; j < DICT_CACHE_SIZE; j++) {
#if USE_CACHE
                if (this->cache[j].chunk == i) {
                    found = 1;
                    target = j;
                    break;
                }
#endif
                if (this->cache[j].stamp < lastStamp) {
                    lastStamp = this->cache[j].stamp;
                    target = j;
                }
            }

            this->cache[target].stamp = ++stamp;
            if (found) {
                count = this->cache[target].count;
                inBuffer = this->cache[target].inBuffer;
            } else {
                this->cache[target].chunk = i;
                if (!this->cache[target].inBuffer)
                    this->cache[target].inBuffer = (char *)malloc(IN_BUFFER_SIZE);
                inBuffer = this->cache[target].inBuffer;

                if (this->chunks[i] >= OUT_BUFFER_SIZE) {
                    //err_internal( __FUNCTION__,
                    //    "this->chunks[%d] = %d >= %ld (OUT_BUFFER_SIZE)\n",
                    //  i, this->chunks[i], OUT_BUFFER_SIZE );
                }
                memcpy(outBuffer, this->start + this->offsets[i], this->chunks[i]);

                this->zStream.next_in = (Bytef *)outBuffer;
                this->zStream.avail_in = this->chunks[i];
                this->zStream.next_out = (Bytef *)inBuffer;
                this->zStream.avail_out = IN_BUFFER_SIZE;
                if (inflate(&this->zStream, Z_PARTIAL_FLUSH) != Z_OK) {
                    //err_fatal( __FUNCTION__, "inflate: %s\n", this->zStream.msg );
                }
                if (this->zStream.avail_in) {
                    //err_internal( __FUNCTION__,
                    //    "inflate did not flush (%d pending, %d avail)\n",
                    //  this->zStream.avail_in, this->zStream.avail_out );
                }

                count = IN_BUFFER_SIZE - this->zStream.avail_out;

                this->cache[target].count = count;
            }

            if (i == firstChunk) {
                if (i == lastChunk) {
                    memcpy(pt, inBuffer + firstOffset, lastOffset - firstOffset);
                    pt += lastOffset - firstOffset;
                } else {
                    if (count != this->chunkLength) {
                        //err_internal( __FUNCTION__,
                        //	"Length = %d instead of %d\n",
                        //count, this->chunkLength );
                    }
                    memcpy(pt, inBuffer + firstOffset,
                           this->chunkLength - firstOffset);
                    pt += this->chunkLength - firstOffset;
                }
            } else if (i == lastChunk) {
                memcpy(pt, inBuffer, lastOffset);
                pt += lastOffset;
            } else {
                assert(count == this->chunkLength);
                memcpy(pt, inBuffer, this->chunkLength);
                pt += this->chunkLength;
            }
        }
        //*pt = '\0';
        break;
    case DICT_UNKNOWN:
        //err_fatal( __FUNCTION__, "Cannot read unknown file type\n" );
        break;
    }
}