mirror of
https://github.com/Dushistov/sdcv.git
synced 2025-12-15 17:31:56 +00:00
480 lines
16 KiB
C++
480 lines
16 KiB
C++
/* dictziplib.c --
|
|
* http://stardict.sourceforge.net
|
|
* Copyright (C) 2003-2003 Hu Zheng <huzheng_001@163.com>
|
|
* This file is a modified version of dictd-1.9.7's data.c
|
|
*
|
|
* data.c --
|
|
* Created: Tue Jul 16 12:45:41 1996 by faith@dict.org
|
|
* Revised: Sat Mar 30 10:46:06 2002 by faith@dict.org
|
|
* Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org)
|
|
*
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU Library General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
//#define HAVE_MMAP //it will defined in config.h. this can be done by configure.in with a AC_FUNC_MMAP.
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config.h"
|
|
#endif
|
|
|
|
#include <cassert>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <fcntl.h>
|
|
#include <limits.h>
|
|
#include <unistd.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include "dictziplib.hpp"
|
|
|
|
#define USE_CACHE 1
|
|
|
|
#define BUFFERSIZE 10240
|
|
|
|
/*
|
|
* Output buffer must be greater than or
|
|
* equal to 110% of input buffer size, plus
|
|
* 12 bytes.
|
|
*/
|
|
#define OUT_BUFFER_SIZE 0xffffL
|
|
|
|
#define IN_BUFFER_SIZE ((unsigned long)((double)(OUT_BUFFER_SIZE - 12) * 0.89))
|
|
|
|
/* For gzip-compatible header, as defined in RFC 1952 */
|
|
|
|
/* Magic for GZIP (rfc1952) */
|
|
#define GZ_MAGIC1 0x1f /* First magic byte */
|
|
#define GZ_MAGIC2 0x8b /* Second magic byte */
|
|
|
|
/* FLaGs (bitmapped), from rfc1952 */
|
|
#define GZ_FTEXT 0x01 /* Set for ASCII text */
|
|
#define GZ_FHCRC 0x02 /* Header CRC16 */
|
|
#define GZ_FEXTRA 0x04 /* Optional field (random access index) */
|
|
#define GZ_FNAME 0x08 /* Original name */
|
|
#define GZ_COMMENT 0x10 /* Zero-terminated, human-readable comment */
|
|
#define GZ_MAX 2 /* Maximum compression */
|
|
#define GZ_FAST 4 /* Fastest compression */
|
|
|
|
/* These are from rfc1952 */
|
|
#define GZ_OS_FAT 0 /* FAT filesystem (MS-DOS, OS/2, NT/Win32) */
|
|
#define GZ_OS_AMIGA 1 /* Amiga */
|
|
#define GZ_OS_VMS 2 /* VMS (or OpenVMS) */
|
|
#define GZ_OS_UNIX 3 /* Unix */
|
|
#define GZ_OS_VMCMS 4 /* VM/CMS */
|
|
#define GZ_OS_ATARI 5 /* Atari TOS */
|
|
#define GZ_OS_HPFS 6 /* HPFS filesystem (OS/2, NT) */
|
|
#define GZ_OS_MAC 7 /* Macintosh */
|
|
#define GZ_OS_Z 8 /* Z-System */
|
|
#define GZ_OS_CPM 9 /* CP/M */
|
|
#define GZ_OS_TOPS20 10 /* TOPS-20 */
|
|
#define GZ_OS_NTFS 11 /* NTFS filesystem (NT) */
|
|
#define GZ_OS_QDOS 12 /* QDOS */
|
|
#define GZ_OS_ACORN 13 /* Acorn RISCOS */
|
|
#define GZ_OS_UNKNOWN 255 /* unknown */
|
|
|
|
#define GZ_RND_S1 'R' /* First magic for random access format */
|
|
#define GZ_RND_S2 'A' /* Second magic for random access format */
|
|
|
|
#define GZ_ID1 0 /* GZ_MAGIC1 */
|
|
#define GZ_ID2 1 /* GZ_MAGIC2 */
|
|
#define GZ_CM 2 /* Compression Method (Z_DEFLATED) */
|
|
#define GZ_FLG 3 /* FLaGs (see above) */
|
|
#define GZ_MTIME 4 /* Modification TIME */
|
|
#define GZ_XFL 8 /* eXtra FLags (GZ_MAX or GZ_FAST) */
|
|
#define GZ_OS 9 /* Operating System */
|
|
#define GZ_XLEN 10 /* eXtra LENgth (16bit) */
|
|
#define GZ_FEXTRA_START 12 /* Start of extra fields */
|
|
#define GZ_SI1 12 /* Subfield ID1 */
|
|
#define GZ_SI2 13 /* Subfield ID2 */
|
|
#define GZ_SUBLEN 14 /* Subfield length (16bit) */
|
|
#define GZ_VERSION 16 /* Version for subfield format */
|
|
#define GZ_CHUNKLEN 18 /* Chunk length (16bit) */
|
|
#define GZ_CHUNKCNT 20 /* Number of chunks (16bit) */
|
|
#define GZ_RNDDATA 22 /* Random access data (16bit) */
|
|
|
|
#define DICT_UNKNOWN 0
|
|
#define DICT_TEXT 1
|
|
#define DICT_GZIP 2
|
|
#define DICT_DZIP 3
|
|
|
|
int DictData::read_header(const std::string &fname, int computeCRC)
|
|
{
|
|
FILE *str;
|
|
int id1, id2, si1, si2;
|
|
char buffer[BUFFERSIZE];
|
|
int extraLength, subLength;
|
|
int i;
|
|
char *pt;
|
|
int c;
|
|
struct stat sb;
|
|
unsigned long crc = crc32(0L, Z_NULL, 0);
|
|
int count;
|
|
unsigned long offset;
|
|
|
|
if (!(str = fopen(fname.c_str(), "rb"))) {
|
|
//err_fatal_errno( __FUNCTION__,
|
|
// "Cannot open data file \"%s\" for read\n", filename );
|
|
return -1;
|
|
}
|
|
|
|
this->headerLength = GZ_XLEN - 1;
|
|
this->type = DICT_UNKNOWN;
|
|
|
|
id1 = getc(str);
|
|
id2 = getc(str);
|
|
|
|
if (id1 != GZ_MAGIC1 || id2 != GZ_MAGIC2) {
|
|
this->type = DICT_TEXT;
|
|
fstat(fileno(str), &sb);
|
|
this->compressedLength = this->length = sb.st_size;
|
|
this->origFilename = fname;
|
|
this->mtime = sb.st_mtime;
|
|
if (computeCRC) {
|
|
rewind(str);
|
|
while (!feof(str)) {
|
|
if ((count = fread(buffer, 1, BUFFERSIZE, str))) {
|
|
crc = crc32(crc, (Bytef *)buffer, count);
|
|
}
|
|
}
|
|
}
|
|
this->crc = crc;
|
|
fclose(str);
|
|
return 0;
|
|
}
|
|
this->type = DICT_GZIP;
|
|
|
|
this->method = getc(str);
|
|
this->flags = getc(str);
|
|
this->mtime = getc(str) << 0;
|
|
this->mtime |= getc(str) << 8;
|
|
this->mtime |= getc(str) << 16;
|
|
this->mtime |= getc(str) << 24;
|
|
this->extraFlags = getc(str);
|
|
this->os = getc(str);
|
|
|
|
if (this->flags & GZ_FEXTRA) {
|
|
extraLength = getc(str) << 0;
|
|
extraLength |= getc(str) << 8;
|
|
this->headerLength += extraLength + 2;
|
|
si1 = getc(str);
|
|
si2 = getc(str);
|
|
|
|
if (si1 == GZ_RND_S1 || si2 == GZ_RND_S2) {
|
|
subLength = getc(str) << 0;
|
|
subLength |= getc(str) << 8;
|
|
this->version = getc(str) << 0;
|
|
this->version |= getc(str) << 8;
|
|
|
|
if (this->version != 1) {
|
|
//err_internal( __FUNCTION__,
|
|
// "dzip header version %d not supported\n",
|
|
// this->version );
|
|
}
|
|
|
|
this->chunkLength = getc(str) << 0;
|
|
this->chunkLength |= getc(str) << 8;
|
|
this->chunkCount = getc(str) << 0;
|
|
this->chunkCount |= getc(str) << 8;
|
|
|
|
if (this->chunkCount <= 0) {
|
|
fclose(str);
|
|
return 5;
|
|
}
|
|
this->chunks = (int *)malloc(sizeof(this->chunks[0])
|
|
* this->chunkCount);
|
|
for (i = 0; i < this->chunkCount; i++) {
|
|
this->chunks[i] = getc(str) << 0;
|
|
this->chunks[i] |= getc(str) << 8;
|
|
}
|
|
this->type = DICT_DZIP;
|
|
} else {
|
|
fseek(str, this->headerLength, SEEK_SET);
|
|
}
|
|
}
|
|
|
|
if (this->flags & GZ_FNAME) { /* FIXME! Add checking against header len */
|
|
pt = buffer;
|
|
while ((c = getc(str)) && c != EOF)
|
|
*pt++ = c;
|
|
*pt = '\0';
|
|
|
|
this->origFilename = buffer;
|
|
this->headerLength += this->origFilename.length() + 1;
|
|
} else {
|
|
this->origFilename = "";
|
|
}
|
|
|
|
if (this->flags & GZ_COMMENT) { /* FIXME! Add checking for header len */
|
|
pt = buffer;
|
|
while ((c = getc(str)) && c != EOF)
|
|
*pt++ = c;
|
|
*pt = '\0';
|
|
comment = buffer;
|
|
headerLength += comment.length() + 1;
|
|
} else {
|
|
comment = "";
|
|
}
|
|
|
|
if (this->flags & GZ_FHCRC) {
|
|
getc(str);
|
|
getc(str);
|
|
this->headerLength += 2;
|
|
}
|
|
|
|
if (ftell(str) != this->headerLength + 1) {
|
|
//err_internal( __FUNCTION__,
|
|
// "File position (%lu) != header length + 1 (%d)\n",
|
|
// ftell( str ), this->headerLength + 1 );
|
|
}
|
|
|
|
fseek(str, -8, SEEK_END);
|
|
this->crc = getc(str) << 0;
|
|
this->crc |= getc(str) << 8;
|
|
this->crc |= getc(str) << 16;
|
|
this->crc |= getc(str) << 24;
|
|
this->length = getc(str) << 0;
|
|
this->length |= getc(str) << 8;
|
|
this->length |= getc(str) << 16;
|
|
this->length |= getc(str) << 24;
|
|
this->compressedLength = ftell(str);
|
|
|
|
/* Compute offsets */
|
|
this->offsets = (unsigned long *)malloc(sizeof(this->offsets[0])
|
|
* this->chunkCount);
|
|
for (offset = this->headerLength + 1, i = 0;
|
|
i < this->chunkCount;
|
|
i++) {
|
|
this->offsets[i] = offset;
|
|
offset += this->chunks[i];
|
|
}
|
|
|
|
fclose(str);
|
|
return 0;
|
|
}
|
|
|
|
bool DictData::open(const std::string &fname, int computeCRC)
|
|
{
|
|
struct stat sb;
|
|
int fd;
|
|
|
|
this->initialized = 0;
|
|
|
|
if (stat(fname.c_str(), &sb) || !S_ISREG(sb.st_mode)) {
|
|
//err_warning( __FUNCTION__,
|
|
// "%s is not a regular file -- ignoring\n", fname );
|
|
return false;
|
|
}
|
|
|
|
if (read_header(fname, computeCRC)) {
|
|
//err_fatal( __FUNCTION__,
|
|
// "\"%s\" not in text or dzip format\n", fname );
|
|
return false;
|
|
}
|
|
|
|
if ((fd = ::open(fname.c_str(), O_RDONLY)) < 0) {
|
|
//err_fatal_errno( __FUNCTION__,
|
|
// "Cannot open data file \"%s\"\n", fname );
|
|
return false;
|
|
}
|
|
if (fstat(fd, &sb)) {
|
|
//err_fatal_errno( __FUNCTION__,
|
|
// "Cannot stat data file \"%s\"\n", fname );
|
|
return false;
|
|
}
|
|
|
|
this->size = sb.st_size;
|
|
::close(fd);
|
|
if (!mapfile.open(fname.c_str(), size))
|
|
return false;
|
|
|
|
this->start = mapfile.begin();
|
|
this->end = this->start + this->size;
|
|
|
|
for (size_t j = 0; j < DICT_CACHE_SIZE; j++) {
|
|
cache[j].chunk = -1;
|
|
cache[j].stamp = -1;
|
|
cache[j].inBuffer = nullptr;
|
|
cache[j].count = 0;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void DictData::close()
|
|
{
|
|
if (this->chunks)
|
|
free(this->chunks);
|
|
if (this->offsets)
|
|
free(this->offsets);
|
|
|
|
if (this->initialized) {
|
|
if (inflateEnd(&this->zStream)) {
|
|
//err_internal( __FUNCTION__,
|
|
// "Cannot shut down inflation engine: %s\n",
|
|
// this->zStream.msg );
|
|
}
|
|
}
|
|
|
|
for (size_t i = 0; i < DICT_CACHE_SIZE; ++i) {
|
|
if (this->cache[i].inBuffer)
|
|
free(this->cache[i].inBuffer);
|
|
}
|
|
}
|
|
|
|
void DictData::read(char *buffer, unsigned long start, unsigned long size)
|
|
{
|
|
char *pt;
|
|
unsigned long end;
|
|
int count;
|
|
char *inBuffer;
|
|
char outBuffer[OUT_BUFFER_SIZE];
|
|
int firstChunk, lastChunk;
|
|
int firstOffset, lastOffset;
|
|
int i;
|
|
int found, target, lastStamp;
|
|
static int stamp = 0;
|
|
|
|
end = start + size;
|
|
|
|
//buffer = malloc( size + 1 );
|
|
|
|
//PRINTF(DBG_UNZIP,
|
|
// ("dict_data_read( %p, %lu, %lu )\n",
|
|
//h, start, size ));
|
|
|
|
switch (this->type) {
|
|
case DICT_GZIP:
|
|
//err_fatal( __FUNCTION__,
|
|
// "Cannot seek on pure gzip format files.\n"
|
|
// "Use plain text (for performance)"
|
|
// " or dzip format (for space savings).\n" );
|
|
break;
|
|
case DICT_TEXT:
|
|
memcpy(buffer, this->start + start, size);
|
|
//buffer[size] = '\0';
|
|
break;
|
|
case DICT_DZIP:
|
|
if (!this->initialized) {
|
|
++this->initialized;
|
|
this->zStream.zalloc = nullptr;
|
|
this->zStream.zfree = nullptr;
|
|
this->zStream.opaque = nullptr;
|
|
this->zStream.next_in = 0;
|
|
this->zStream.avail_in = 0;
|
|
this->zStream.next_out = nullptr;
|
|
this->zStream.avail_out = 0;
|
|
if (inflateInit2(&this->zStream, -15) != Z_OK) {
|
|
//err_internal( __FUNCTION__,
|
|
// "Cannot initialize inflation engine: %s\n",
|
|
//this->zStream.msg );
|
|
}
|
|
}
|
|
firstChunk = start / this->chunkLength;
|
|
firstOffset = start - firstChunk * this->chunkLength;
|
|
lastChunk = end / this->chunkLength;
|
|
lastOffset = end - lastChunk * this->chunkLength;
|
|
//PRINTF(DBG_UNZIP,
|
|
// (" start = %lu, end = %lu\n"
|
|
//"firstChunk = %d, firstOffset = %d,"
|
|
//" lastChunk = %d, lastOffset = %d\n",
|
|
//start, end, firstChunk, firstOffset, lastChunk, lastOffset ));
|
|
for (pt = buffer, i = firstChunk; i <= lastChunk; i++) {
|
|
|
|
/* Access cache */
|
|
found = 0;
|
|
target = 0;
|
|
lastStamp = INT_MAX;
|
|
for (size_t j = 0; j < DICT_CACHE_SIZE; j++) {
|
|
#if USE_CACHE
|
|
if (this->cache[j].chunk == i) {
|
|
found = 1;
|
|
target = j;
|
|
break;
|
|
}
|
|
#endif
|
|
if (this->cache[j].stamp < lastStamp) {
|
|
lastStamp = this->cache[j].stamp;
|
|
target = j;
|
|
}
|
|
}
|
|
|
|
this->cache[target].stamp = ++stamp;
|
|
if (found) {
|
|
count = this->cache[target].count;
|
|
inBuffer = this->cache[target].inBuffer;
|
|
} else {
|
|
this->cache[target].chunk = i;
|
|
if (!this->cache[target].inBuffer)
|
|
this->cache[target].inBuffer = (char *)malloc(IN_BUFFER_SIZE);
|
|
inBuffer = this->cache[target].inBuffer;
|
|
|
|
if (this->chunks[i] >= OUT_BUFFER_SIZE) {
|
|
//err_internal( __FUNCTION__,
|
|
// "this->chunks[%d] = %d >= %ld (OUT_BUFFER_SIZE)\n",
|
|
// i, this->chunks[i], OUT_BUFFER_SIZE );
|
|
}
|
|
memcpy(outBuffer, this->start + this->offsets[i], this->chunks[i]);
|
|
|
|
this->zStream.next_in = (Bytef *)outBuffer;
|
|
this->zStream.avail_in = this->chunks[i];
|
|
this->zStream.next_out = (Bytef *)inBuffer;
|
|
this->zStream.avail_out = IN_BUFFER_SIZE;
|
|
if (inflate(&this->zStream, Z_PARTIAL_FLUSH) != Z_OK) {
|
|
//err_fatal( __FUNCTION__, "inflate: %s\n", this->zStream.msg );
|
|
}
|
|
if (this->zStream.avail_in) {
|
|
//err_internal( __FUNCTION__,
|
|
// "inflate did not flush (%d pending, %d avail)\n",
|
|
// this->zStream.avail_in, this->zStream.avail_out );
|
|
}
|
|
|
|
count = IN_BUFFER_SIZE - this->zStream.avail_out;
|
|
|
|
this->cache[target].count = count;
|
|
}
|
|
|
|
if (i == firstChunk) {
|
|
if (i == lastChunk) {
|
|
memcpy(pt, inBuffer + firstOffset, lastOffset - firstOffset);
|
|
pt += lastOffset - firstOffset;
|
|
} else {
|
|
if (count != this->chunkLength) {
|
|
//err_internal( __FUNCTION__,
|
|
// "Length = %d instead of %d\n",
|
|
//count, this->chunkLength );
|
|
}
|
|
memcpy(pt, inBuffer + firstOffset,
|
|
this->chunkLength - firstOffset);
|
|
pt += this->chunkLength - firstOffset;
|
|
}
|
|
} else if (i == lastChunk) {
|
|
memcpy(pt, inBuffer, lastOffset);
|
|
pt += lastOffset;
|
|
} else {
|
|
assert(count == this->chunkLength);
|
|
memcpy(pt, inBuffer, this->chunkLength);
|
|
pt += this->chunkLength;
|
|
}
|
|
}
|
|
//*pt = '\0';
|
|
break;
|
|
case DICT_UNKNOWN:
|
|
//err_fatal( __FUNCTION__, "Cannot read unknown file type\n" );
|
|
break;
|
|
}
|
|
}
|