/* dictziplib.c -- * http://stardict.sourceforge.net * Copyright (C) 2003-2003 Hu Zheng * This file is a modify version of dictd-1.9.7's data.c * * data.c -- * Created: Tue Jul 16 12:45:41 1996 by faith@dict.org * Revised: Sat Mar 30 10:46:06 2002 by faith@dict.org * Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org) * * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ //#define HAVE_MMAP //it will defined in config.h. this can be done by configure.in with a AC_FUNC_MMAP. #ifdef HAVE_CONFIG_H # include "config.h" #endif #include #include #include #include #include #include #include #include #include "dictziplib.hpp" #define USE_CACHE 1 #define BUFFERSIZE 10240 /* * Output buffer must be greater than or * equal to 110% of input buffer size, plus * 12 bytes. */ #define OUT_BUFFER_SIZE 0xffffL #define IN_BUFFER_SIZE ((unsigned long)((double)(OUT_BUFFER_SIZE - 12) * 0.89)) /* For gzip-compatible header, as defined in RFC 1952 */ /* Magic for GZIP (rfc1952) */ #define GZ_MAGIC1 0x1f /* First magic byte */ #define GZ_MAGIC2 0x8b /* Second magic byte */ /* FLaGs (bitmapped), from rfc1952 */ #define GZ_FTEXT 0x01 /* Set for ASCII text */ #define GZ_FHCRC 0x02 /* Header CRC16 */ #define GZ_FEXTRA 0x04 /* Optional field (random access index) */ #define GZ_FNAME 0x08 /* Original name */ #define GZ_COMMENT 0x10 /* Zero-terminated, human-readable comment */ #define GZ_MAX 2 /* Maximum compression */ #define GZ_FAST 4 /* Fasted compression */ /* These are from rfc1952 */ #define GZ_OS_FAT 0 /* FAT filesystem (MS-DOS, OS/2, NT/Win32) */ #define GZ_OS_AMIGA 1 /* Amiga */ #define GZ_OS_VMS 2 /* VMS (or OpenVMS) */ #define GZ_OS_UNIX 3 /* Unix */ #define GZ_OS_VMCMS 4 /* VM/CMS */ #define GZ_OS_ATARI 5 /* Atari TOS */ #define GZ_OS_HPFS 6 /* HPFS filesystem (OS/2, NT) */ #define GZ_OS_MAC 7 /* Macintosh */ #define GZ_OS_Z 8 /* Z-System */ #define GZ_OS_CPM 9 /* CP/M */ #define GZ_OS_TOPS20 10 /* TOPS-20 */ #define GZ_OS_NTFS 11 /* NTFS filesystem (NT) */ #define GZ_OS_QDOS 12 /* QDOS */ #define GZ_OS_ACORN 13 /* Acorn RISCOS */ #define GZ_OS_UNKNOWN 255 /* unknown */ #define GZ_RND_S1 'R' /* First magic for random access format */ #define GZ_RND_S2 'A' /* Second magic for random access format */ #define GZ_ID1 0 /* GZ_MAGIC1 */ #define GZ_ID2 1 /* GZ_MAGIC2 */ #define GZ_CM 2 /* Compression Method (Z_DEFALTED) */ #define GZ_FLG 3 /* FLaGs (see above) */ #define GZ_MTIME 4 /* Modification TIME */ #define GZ_XFL 8 /* eXtra FLags (GZ_MAX or GZ_FAST) */ #define GZ_OS 9 /* Operating System */ #define GZ_XLEN 10 /* eXtra LENgth (16bit) */ #define GZ_FEXTRA_START 12 /* Start of extra fields */ #define GZ_SI1 12 /* Subfield ID1 */ #define GZ_SI2 13 /* Subfield ID2 */ #define GZ_SUBLEN 14 /* Subfield length (16bit) */ #define GZ_VERSION 16 /* Version for subfield format */ #define GZ_CHUNKLEN 18 /* Chunk length (16bit) */ #define GZ_CHUNKCNT 20 /* Number of chunks (16bit) */ #define GZ_RNDDATA 22 /* Random access data (16bit) */ #define DICT_UNKNOWN 0 #define DICT_TEXT 1 #define DICT_GZIP 2 #define DICT_DZIP 3 int DictData::read_header(const std::string &fname, int computeCRC) { FILE *str; int id1, id2, si1, si2; char buffer[BUFFERSIZE]; int extraLength, subLength; int i; char *pt; int c; struct stat sb; unsigned long crc = crc32( 0L, Z_NULL, 0 ); int count; unsigned long offset; if (!(str = fopen(fname.c_str(), "rb"))) { //err_fatal_errno( __FUNCTION__, // "Cannot open data file \"%s\" for read\n", filename ); return -1; } this->headerLength = GZ_XLEN - 1; this->type = DICT_UNKNOWN; id1 = getc( str ); id2 = getc( str ); if (id1 != GZ_MAGIC1 || id2 != GZ_MAGIC2) { this->type = DICT_TEXT; fstat( fileno( str ), &sb ); this->compressedLength = this->length = sb.st_size; this->origFilename = fname; this->mtime = sb.st_mtime; if (computeCRC) { rewind( str ); while (!feof( str )) { if ((count = fread( buffer, 1, BUFFERSIZE, str ))) { crc = crc32(crc, (Bytef *)buffer, count); } } } this->crc = crc; fclose( str ); return 0; } this->type = DICT_GZIP; this->method = getc( str ); this->flags = getc( str ); this->mtime = getc( str ) << 0; this->mtime |= getc( str ) << 8; this->mtime |= getc( str ) << 16; this->mtime |= getc( str ) << 24; this->extraFlags = getc( str ); this->os = getc( str ); if (this->flags & GZ_FEXTRA) { extraLength = getc( str ) << 0; extraLength |= getc( str ) << 8; this->headerLength += extraLength + 2; si1 = getc( str ); si2 = getc( str ); if (si1 == GZ_RND_S1 || si2 == GZ_RND_S2) { subLength = getc( str ) << 0; subLength |= getc( str ) << 8; this->version = getc( str ) << 0; this->version |= getc( str ) << 8; if (this->version != 1) { //err_internal( __FUNCTION__, // "dzip header version %d not supported\n", // this->version ); } this->chunkLength = getc( str ) << 0; this->chunkLength |= getc( str ) << 8; this->chunkCount = getc( str ) << 0; this->chunkCount |= getc( str ) << 8; if (this->chunkCount <= 0) { fclose( str ); return 5; } this->chunks = (int *)malloc(sizeof( this->chunks[0] ) * this->chunkCount ); for (i = 0; i < this->chunkCount; i++) { this->chunks[i] = getc( str ) << 0; this->chunks[i] |= getc( str ) << 8; } this->type = DICT_DZIP; } else { fseek( str, this->headerLength, SEEK_SET ); } } if (this->flags & GZ_FNAME) { /* FIXME! Add checking against header len */ pt = buffer; while ((c = getc( str )) && c != EOF) *pt++ = c; *pt = '\0'; this->origFilename = buffer; this->headerLength += this->origFilename.length() + 1; } else { this->origFilename = ""; } if (this->flags & GZ_COMMENT) { /* FIXME! Add checking for header len */ pt = buffer; while ((c = getc( str )) && c != EOF) *pt++ = c; *pt = '\0'; comment = buffer; headerLength += comment.length()+1; } else { comment = ""; } if (this->flags & GZ_FHCRC) { getc( str ); getc( str ); this->headerLength += 2; } if (ftell( str ) != this->headerLength + 1) { //err_internal( __FUNCTION__, // "File position (%lu) != header length + 1 (%d)\n", // ftell( str ), this->headerLength + 1 ); } fseek( str, -8, SEEK_END ); this->crc = getc( str ) << 0; this->crc |= getc( str ) << 8; this->crc |= getc( str ) << 16; this->crc |= getc( str ) << 24; this->length = getc( str ) << 0; this->length |= getc( str ) << 8; this->length |= getc( str ) << 16; this->length |= getc( str ) << 24; this->compressedLength = ftell( str ); /* Compute offsets */ this->offsets = (unsigned long *)malloc( sizeof( this->offsets[0] ) * this->chunkCount ); for (offset = this->headerLength + 1, i = 0; i < this->chunkCount; i++) { this->offsets[i] = offset; offset += this->chunks[i]; } fclose( str ); return 0; } bool DictData::open(const std::string& fname, int computeCRC) { struct stat sb; int fd; this->initialized = 0; if (stat(fname.c_str(), &sb) || !S_ISREG(sb.st_mode)) { //err_warning( __FUNCTION__, // "%s is not a regular file -- ignoring\n", fname ); return false; } if (read_header(fname, computeCRC)) { //err_fatal( __FUNCTION__, // "\"%s\" not in text or dzip format\n", fname ); return false; } if ((fd = ::open(fname.c_str(), O_RDONLY )) < 0) { //err_fatal_errno( __FUNCTION__, // "Cannot open data file \"%s\"\n", fname ); return false; } if (fstat(fd, &sb)) { //err_fatal_errno( __FUNCTION__, // "Cannot stat data file \"%s\"\n", fname ); return false; } this->size = sb.st_size; ::close(fd); if (!mapfile.open(fname.c_str(), size)) return false; this->start=mapfile.begin(); this->end = this->start + this->size; for (size_t j = 0; j < DICT_CACHE_SIZE; j++) { cache[j].chunk = -1; cache[j].stamp = -1; cache[j].inBuffer = nullptr; cache[j].count = 0; } return true; } void DictData::close() { if (this->chunks) free(this->chunks); if (this->offsets) free(this->offsets); if (this->initialized) { if (inflateEnd( &this->zStream )) { //err_internal( __FUNCTION__, // "Cannot shut down inflation engine: %s\n", // this->zStream.msg ); } } for (size_t i = 0; i < DICT_CACHE_SIZE; ++i){ if (this -> cache [i].inBuffer) free (this -> cache [i].inBuffer); } } void DictData::read(char *buffer, unsigned long start, unsigned long size) { char *pt; unsigned long end; int count; char *inBuffer; char outBuffer[OUT_BUFFER_SIZE]; int firstChunk, lastChunk; int firstOffset, lastOffset; int i; int found, target, lastStamp; static int stamp = 0; end = start + size; //buffer = malloc( size + 1 ); //PRINTF(DBG_UNZIP, // ("dict_data_read( %p, %lu, %lu )\n", //h, start, size )); switch (this->type) { case DICT_GZIP: //err_fatal( __FUNCTION__, // "Cannot seek on pure gzip format files.\n" // "Use plain text (for performance)" // " or dzip format (for space savings).\n" ); break; case DICT_TEXT: memcpy( buffer, this->start + start, size ); //buffer[size] = '\0'; break; case DICT_DZIP: if (!this->initialized) { ++this->initialized; this->zStream.zalloc = nullptr; this->zStream.zfree = nullptr; this->zStream.opaque = nullptr; this->zStream.next_in = 0; this->zStream.avail_in = 0; this->zStream.next_out = nullptr; this->zStream.avail_out = 0; if (inflateInit2( &this->zStream, -15 ) != Z_OK) { //err_internal( __FUNCTION__, // "Cannot initialize inflation engine: %s\n", //this->zStream.msg ); } } firstChunk = start / this->chunkLength; firstOffset = start - firstChunk * this->chunkLength; lastChunk = end / this->chunkLength; lastOffset = end - lastChunk * this->chunkLength; //PRINTF(DBG_UNZIP, // (" start = %lu, end = %lu\n" //"firstChunk = %d, firstOffset = %d," //" lastChunk = %d, lastOffset = %d\n", //start, end, firstChunk, firstOffset, lastChunk, lastOffset )); for (pt = buffer, i = firstChunk; i <= lastChunk; i++) { /* Access cache */ found = 0; target = 0; lastStamp = INT_MAX; for (size_t j = 0; j < DICT_CACHE_SIZE; j++) { #if USE_CACHE if (this->cache[j].chunk == i) { found = 1; target = j; break; } #endif if (this->cache[j].stamp < lastStamp) { lastStamp = this->cache[j].stamp; target = j; } } this->cache[target].stamp = ++stamp; if (found) { count = this->cache[target].count; inBuffer = this->cache[target].inBuffer; } else { this->cache[target].chunk = i; if (!this->cache[target].inBuffer) this->cache[target].inBuffer = (char *)malloc( IN_BUFFER_SIZE ); inBuffer = this->cache[target].inBuffer; if (this->chunks[i] >= OUT_BUFFER_SIZE ) { //err_internal( __FUNCTION__, // "this->chunks[%d] = %d >= %ld (OUT_BUFFER_SIZE)\n", // i, this->chunks[i], OUT_BUFFER_SIZE ); } memcpy( outBuffer, this->start + this->offsets[i], this->chunks[i] ); this->zStream.next_in = (Bytef *)outBuffer; this->zStream.avail_in = this->chunks[i]; this->zStream.next_out = (Bytef *)inBuffer; this->zStream.avail_out = IN_BUFFER_SIZE; if (inflate( &this->zStream, Z_PARTIAL_FLUSH ) != Z_OK) { //err_fatal( __FUNCTION__, "inflate: %s\n", this->zStream.msg ); } if (this->zStream.avail_in) { //err_internal( __FUNCTION__, // "inflate did not flush (%d pending, %d avail)\n", // this->zStream.avail_in, this->zStream.avail_out ); } count = IN_BUFFER_SIZE - this->zStream.avail_out; this->cache[target].count = count; } if (i == firstChunk) { if (i == lastChunk) { memcpy( pt, inBuffer + firstOffset, lastOffset-firstOffset); pt += lastOffset - firstOffset; } else { if (count != this->chunkLength ) { //err_internal( __FUNCTION__, // "Length = %d instead of %d\n", //count, this->chunkLength ); } memcpy( pt, inBuffer + firstOffset, this->chunkLength - firstOffset ); pt += this->chunkLength - firstOffset; } } else if (i == lastChunk) { memcpy( pt, inBuffer, lastOffset ); pt += lastOffset; } else { assert( count == this->chunkLength ); memcpy( pt, inBuffer, this->chunkLength ); pt += this->chunkLength; } } //*pt = '\0'; break; case DICT_UNKNOWN: //err_fatal( __FUNCTION__, "Cannot read unknown file type\n" ); break; } }