#ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include "distance.hpp" #include "mapfile.hpp" #include "utils.hpp" #include "stardict_lib.hpp" #define TO_STR2(xstr) #xstr #define TO_STR1(xstr) TO_STR2(xstr) #define THROW_IF_ERROR(expr) \ do { \ assert((expr)); \ if (!(expr)) \ throw std::runtime_error(#expr " not true at " __FILE__ ": " TO_STR1(__LINE__)); \ } while (false) // Notice: read src/tools/DICTFILE_FORMAT for the dictionary // file's format information! namespace { struct Fuzzystruct { char *pMatchWord; int iMatchWordDistance; }; static inline bool bIsVowel(gchar inputchar) { gchar ch = g_ascii_toupper(inputchar); return (ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U'); } static bool bIsPureEnglish(const gchar *str) { // i think this should work even when it is UTF8 string :). for (int i = 0; str[i] != 0; i++) // if(str[i]<0) // if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK. // Better use isascii() but not str[i]<0 while char is default unsigned in arm if (!isascii(str[i])) return false; return true; } static inline gint stardict_strcmp(const gchar *s1, const gchar *s2) { const gint a = g_ascii_strcasecmp(s1, s2); if (a == 0) return strcmp(s1, s2); else return a; } static void unicode_strdown(gunichar *str) { while (*str) { *str = g_unichar_tolower(*str); ++str; } } } // namespace bool DictInfo::load_from_ifo_file(const std::string &ifofilename, bool istreedict) { ifo_file_name = ifofilename; glib::CharStr buffer; if (!g_file_get_contents(ifofilename.c_str(), get_addr(buffer), nullptr, nullptr)) return false; static const char TREEDICT_MAGIC_DATA[] = "StarDict's treedict ifo file"; static const char DICT_MAGIC_DATA[] = "StarDict's dict ifo file"; const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA; static const unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF, '\0' }; if (!g_str_has_prefix( g_str_has_prefix(get_impl(buffer), (const gchar *)(utf8_bom)) ? get_impl(buffer) + 3 : get_impl(buffer), magic_data)) { return false; } gchar *p1 = get_impl(buffer) + strlen(magic_data) - 1; gchar *p2 = strstr(p1, "\nwordcount="); if (p2 == nullptr) return false; gchar *p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n'); wordcount = atol(std::string(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1)).c_str()); if (istreedict) { p2 = strstr(p1, "\ntdxfilesize="); if (p2 == nullptr) return false; p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n'); index_file_size = atol(std::string(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1)).c_str()); } else { p2 = strstr(p1, "\nidxfilesize="); if (p2 == nullptr) return false; p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n'); index_file_size = atol(std::string(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1)).c_str()); } p2 = strstr(p1, "\nbookname="); if (p2 == nullptr) return false; p2 = p2 + sizeof("\nbookname=") - 1; p3 = strchr(p2, '\n'); bookname.assign(p2, p3 - p2); p2 = strstr(p1, "\nauthor="); if (p2) { p2 = p2 + sizeof("\nauthor=") - 1; p3 = strchr(p2, '\n'); author.assign(p2, p3 - p2); } p2 = strstr(p1, "\nemail="); if (p2) { p2 = p2 + sizeof("\nemail=") - 1; p3 = strchr(p2, '\n'); email.assign(p2, p3 - p2); } p2 = strstr(p1, "\nwebsite="); if (p2) { p2 = p2 + sizeof("\nwebsite=") - 1; p3 = strchr(p2, '\n'); website.assign(p2, p3 - p2); } p2 = strstr(p1, "\ndate="); if (p2) { p2 = p2 + sizeof("\ndate=") - 1; p3 = strchr(p2, '\n'); date.assign(p2, p3 - p2); } p2 = strstr(p1, "\ndescription="); if (p2) { p2 = p2 + sizeof("\ndescription=") - 1; p3 = strchr(p2, '\n'); description.assign(p2, p3 - p2); } p2 = strstr(p1, "\nsametypesequence="); if (p2) { p2 += sizeof("\nsametypesequence=") - 1; p3 = strchr(p2, '\n'); sametypesequence.assign(p2, p3 - p2); } p2 = strstr(p1, "\nsynwordcount="); syn_wordcount = 0; if (p2) { p2 += sizeof("\nsynwordcount=") - 1; p3 = strchr(p2, '\n'); syn_wordcount = atol(std::string(p2, p3 - p2).c_str()); } return true; } gchar *DictBase::GetWordData(guint32 idxitem_offset, guint32 idxitem_size) { for (int i = 0; i < WORDDATA_CACHE_NUM; i++) if (cache[i].data && cache[i].offset == idxitem_offset) return cache[i].data; if (dictfile) fseek(dictfile, idxitem_offset, SEEK_SET); gchar *data; if (!sametypesequence.empty()) { glib::CharStr origin_data((gchar *)g_malloc(idxitem_size)); if (dictfile) { const size_t nitems = fread(get_impl(origin_data), idxitem_size, 1, dictfile); THROW_IF_ERROR(nitems == 1); } else dictdzfile->read(get_impl(origin_data), idxitem_offset, idxitem_size); guint32 data_size; gint sametypesequence_len = sametypesequence.length(); // there have sametypesequence_len char being omitted. data_size = idxitem_size + sizeof(guint32) + sametypesequence_len; // if the last item's size is determined by the end up '\0',then +=sizeof(gchar); // if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32); switch (sametypesequence[sametypesequence_len - 1]) { case 'm': case 't': case 'y': case 'l': case 'g': case 'x': case 'k': data_size += sizeof(gchar); break; case 'W': case 'P': data_size += sizeof(guint32); break; default: if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1])) data_size += sizeof(guint32); else data_size += sizeof(gchar); break; } data = (gchar *)g_malloc(data_size); gchar *p1, *p2; p1 = data + sizeof(guint32); p2 = get_impl(origin_data); guint32 sec_size; // copy the head items. for (int i = 0; i < sametypesequence_len - 1; i++) { *p1 = sametypesequence[i]; p1 += sizeof(gchar); switch (sametypesequence[i]) { case 'm': case 't': case 'y': case 'l': case 'g': case 'x': case 'k': sec_size = strlen(p2) + 1; memcpy(p1, p2, sec_size); p1 += sec_size; p2 += sec_size; break; case 'W': case 'P': sec_size = get_uint32(p2); sec_size += sizeof(guint32); memcpy(p1, p2, sec_size); p1 += sec_size; p2 += sec_size; break; default: if (g_ascii_isupper(sametypesequence[i])) { sec_size = get_uint32(p2); sec_size += sizeof(guint32); } else { sec_size = strlen(p2) + 1; } memcpy(p1, p2, sec_size); p1 += sec_size; p2 += sec_size; break; } } // calculate the last item 's size. sec_size = idxitem_size - (p2 - get_impl(origin_data)); *p1 = sametypesequence[sametypesequence_len - 1]; p1 += sizeof(gchar); switch (sametypesequence[sametypesequence_len - 1]) { case 'm': case 't': case 'y': case 'l': case 'g': case 'x': case 'k': memcpy(p1, p2, sec_size); p1 += sec_size; *p1 = '\0'; // add the end up '\0'; break; case 'W': case 'P': set_uint32(p1, sec_size); p1 += sizeof(guint32); memcpy(p1, p2, sec_size); break; default: if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1])) { set_uint32(p1, sec_size); p1 += sizeof(guint32); memcpy(p1, p2, sec_size); } else { memcpy(p1, p2, sec_size); p1 += sec_size; *p1 = '\0'; } break; } set_uint32(data, data_size); } else { data = (gchar *)g_malloc(idxitem_size + sizeof(guint32)); if (dictfile) { const size_t nitems = fread(data + sizeof(guint32), idxitem_size, 1, dictfile); THROW_IF_ERROR(nitems == 1); } else dictdzfile->read(data + sizeof(guint32), idxitem_offset, idxitem_size); set_uint32(data, idxitem_size + sizeof(guint32)); } g_free(cache[cache_cur].data); cache[cache_cur].data = data; cache[cache_cur].offset = idxitem_offset; cache_cur++; if (cache_cur == WORDDATA_CACHE_NUM) cache_cur = 0; return data; } bool DictBase::SearchData(std::vector &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data) { int nWord = SearchWords.size(); std::vector WordFind(nWord, false); int nfound = 0; if (dictfile) fseek(dictfile, idxitem_offset, SEEK_SET); THROW_IF_ERROR(origin_data != nullptr); if (dictfile) { const size_t nitems = fread(origin_data, idxitem_size, 1, dictfile); THROW_IF_ERROR(nitems == 1); } else dictdzfile->read(origin_data, idxitem_offset, idxitem_size); gchar *p = origin_data; guint32 sec_size; int j; if (!sametypesequence.empty()) { gint sametypesequence_len = sametypesequence.length(); for (int i = 0; i < sametypesequence_len - 1; i++) { switch (sametypesequence[i]) { case 'm': case 't': case 'y': case 'l': case 'g': case 'x': case 'k': for (j = 0; j < nWord; j++) if (!WordFind[j] && strstr(p, SearchWords[j].c_str())) { WordFind[j] = true; ++nfound; } if (nfound == nWord) return true; sec_size = strlen(p) + 1; p += sec_size; break; default: if (g_ascii_isupper(sametypesequence[i])) { sec_size = get_uint32(p); sec_size += sizeof(guint32); } else { sec_size = strlen(p) + 1; } p += sec_size; } } switch (sametypesequence[sametypesequence_len - 1]) { case 'm': case 't': case 'y': case 'l': case 'g': case 'x': case 'k': sec_size = idxitem_size - (p - origin_data); for (j = 0; j < nWord; j++) if (!WordFind[j] && g_strstr_len(p, sec_size, SearchWords[j].c_str())) { WordFind[j] = true; ++nfound; } if (nfound == nWord) return true; break; } } else { while (guint32(p - origin_data) < idxitem_size) { switch (*p) { case 'm': case 't': case 'y': case 'l': case 'g': case 'x': case 'k': for (j = 0; j < nWord; j++) if (!WordFind[j] && strstr(p, SearchWords[j].c_str())) { WordFind[j] = true; ++nfound; } if (nfound == nWord) return true; sec_size = strlen(p) + 1; p += sec_size; break; default: if (g_ascii_isupper(*p)) { sec_size = get_uint32(p); sec_size += sizeof(guint32); } else { sec_size = strlen(p) + 1; } p += sec_size; } } } return false; } namespace { class OffsetIndex : public IIndexFile { public: OffsetIndex() : idxfile(nullptr) { } ~OffsetIndex() { if (idxfile) fclose(idxfile); } bool load(const std::string &url, gulong wc, gulong fsize, bool verbose) override; const gchar *get_key(glong idx) override; void get_data(glong idx) override { get_key(idx); } const gchar *get_key_and_data(glong idx) override { return get_key(idx); } bool lookup(const char *str, std::set &idxs, glong &next_idx) override; private: static const gint ENTR_PER_PAGE = 32; static const char *CACHE_MAGIC; std::vector wordoffset; FILE *idxfile; gulong wordcount; gchar wordentry_buf[256 + sizeof(guint32) * 2]; // The length of "word_str" should be less than 256. See src/tools/DICTFILE_FORMAT. struct index_entry { glong idx; std::string keystr; void assign(glong i, const std::string &str) { idx = i; keystr.assign(str); } }; index_entry first, last, middle, real_last; struct page_entry { gchar *keystr; guint32 off, size; }; std::vector page_data; struct page_t { glong idx = -1; page_entry entries[ENTR_PER_PAGE]; page_t() {} void fill(gchar *data, gint nent, glong idx_); } page; gulong load_page(glong page_idx); const gchar *read_first_on_page_key(glong page_idx); const gchar *get_first_on_page_key(glong page_idx); bool load_cache(const std::string &url); bool save_cache(const std::string &url, bool verbose); static std::list get_cache_variant(const std::string &url); }; const char *OffsetIndex::CACHE_MAGIC = "StarDict's Cache, Version: 0.2"; #define CACHE_MAGIC_BYTES 0x51a4d1c1 class WordListIndex : public IIndexFile { public: WordListIndex() : idxdatabuf(nullptr) { } ~WordListIndex() { g_free(idxdatabuf); } bool load(const std::string &url, gulong wc, gulong fsize, bool verbose) override; const gchar *get_key(glong idx) override { return wordlist[idx]; } void get_data(glong idx) override; const gchar *get_key_and_data(glong idx) override { get_data(idx); return get_key(idx); } bool lookup(const char *str, std::set &idxs, glong &next_idx) override; private: gchar *idxdatabuf; std::vector wordlist; }; void OffsetIndex::page_t::fill(gchar *data, gint nent, glong idx_) { idx = idx_; gchar *p = data; glong len; for (gint i = 0; i < nent; ++i) { entries[i].keystr = p; len = strlen(p); p += len + 1; entries[i].off = g_ntohl(get_uint32(p)); p += sizeof(guint32); entries[i].size = g_ntohl(get_uint32(p)); p += sizeof(guint32); } } inline const gchar *OffsetIndex::read_first_on_page_key(glong page_idx) { fseek(idxfile, wordoffset[page_idx], SEEK_SET); guint32 page_size = wordoffset[page_idx + 1] - wordoffset[page_idx]; const size_t nitems = fread(wordentry_buf, std::min(sizeof(wordentry_buf), static_cast(page_size)), 1, idxfile); THROW_IF_ERROR(nitems == 1); // TODO: check returned values, deal with word entry that strlen>255. return wordentry_buf; } inline const gchar *OffsetIndex::get_first_on_page_key(glong page_idx) { if (page_idx < middle.idx) { if (page_idx == first.idx) return first.keystr.c_str(); return read_first_on_page_key(page_idx); } else if (page_idx > middle.idx) { if (page_idx == last.idx) return last.keystr.c_str(); return read_first_on_page_key(page_idx); } else return middle.keystr.c_str(); } bool OffsetIndex::load_cache(const std::string &url) { const std::list vars = get_cache_variant(url); for (const std::string &item : vars) { struct ::stat idxstat, cachestat; if (g_stat(url.c_str(), &idxstat) != 0 || g_stat(item.c_str(), &cachestat) != 0) continue; if (cachestat.st_mtime < idxstat.st_mtime) continue; MapFile mf; if (!mf.open(item.c_str(), cachestat.st_size)) continue; if (strncmp(mf.begin(), CACHE_MAGIC, strlen(CACHE_MAGIC)) != 0) continue; guint32 tmp; memcpy(&tmp, mf.begin() + strlen(CACHE_MAGIC), sizeof(tmp)); if (tmp != CACHE_MAGIC_BYTES) continue; memcpy(&wordoffset[0], mf.begin() + strlen(CACHE_MAGIC) + sizeof(guint32), wordoffset.size() * sizeof(wordoffset[0])); return true; } return false; } std::list OffsetIndex::get_cache_variant(const std::string &url) { std::list res = { url + ".oft" }; if (!g_file_test(g_get_user_cache_dir(), G_FILE_TEST_EXISTS) && g_mkdir(g_get_user_cache_dir(), 0700) == -1) return res; const std::string cache_dir = std::string(g_get_user_cache_dir()) + G_DIR_SEPARATOR_S + "sdcv"; if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_EXISTS)) { if (g_mkdir(cache_dir.c_str(), 0700) == -1) return res; } else if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_IS_DIR)) return res; gchar *base = g_path_get_basename(url.c_str()); res.push_back(cache_dir + G_DIR_SEPARATOR_S + base + ".oft"); g_free(base); return res; } bool OffsetIndex::save_cache(const std::string &url, bool verbose) { const std::list vars = get_cache_variant(url); for (const std::string &item : vars) { FILE *out = fopen(item.c_str(), "wb"); guint32 magic = CACHE_MAGIC_BYTES; if (!out) continue; if (fwrite(CACHE_MAGIC, 1, strlen(CACHE_MAGIC), out) != strlen(CACHE_MAGIC)) continue; if (fwrite(&magic, 1, sizeof(magic), out) != sizeof(magic)) continue; if (fwrite(&wordoffset[0], sizeof(wordoffset[0]), wordoffset.size(), out) != wordoffset.size()) continue; fclose(out); if (verbose) { printf("save to cache %s\n", url.c_str()); } return true; } return false; } bool OffsetIndex::load(const std::string &url, gulong wc, gulong fsize, bool verbose) { wordcount = wc; gulong npages = (wc - 1) / ENTR_PER_PAGE + 2; wordoffset.resize(npages); if (!load_cache(url)) { // map file will close after finish of block MapFile map_file; if (!map_file.open(url.c_str(), fsize)) return false; const gchar *idxdatabuffer = map_file.begin(); const gchar *p1 = idxdatabuffer; gulong index_size; guint32 j = 0; for (guint32 i = 0; i < wc; i++) { index_size = strlen(p1) + 1 + 2 * sizeof(guint32); if (i % ENTR_PER_PAGE == 0) { wordoffset[j] = p1 - idxdatabuffer; ++j; } p1 += index_size; } wordoffset[j] = p1 - idxdatabuffer; if (!save_cache(url, verbose)) fprintf(stderr, "cache update failed\n"); } if (!(idxfile = fopen(url.c_str(), "rb"))) { wordoffset.resize(0); return false; } first.assign(0, read_first_on_page_key(0)); last.assign(wordoffset.size() - 2, read_first_on_page_key(wordoffset.size() - 2)); middle.assign((wordoffset.size() - 2) / 2, read_first_on_page_key((wordoffset.size() - 2) / 2)); real_last.assign(wc - 1, get_key(wc - 1)); return true; } inline gulong OffsetIndex::load_page(glong page_idx) { gulong nentr = ENTR_PER_PAGE; if (page_idx == glong(wordoffset.size() - 2)) if ((nentr = (wordcount % ENTR_PER_PAGE)) == 0) nentr = ENTR_PER_PAGE; if (page_idx != page.idx) { page_data.resize(wordoffset[page_idx + 1] - wordoffset[page_idx]); fseek(idxfile, wordoffset[page_idx], SEEK_SET); const size_t nitems = fread(&page_data[0], 1, page_data.size(), idxfile); THROW_IF_ERROR(nitems == page_data.size()); page.fill(&page_data[0], nentr, page_idx); } return nentr; } const gchar *OffsetIndex::get_key(glong idx) { load_page(idx / ENTR_PER_PAGE); glong idx_in_page = idx % ENTR_PER_PAGE; wordentry_offset = page.entries[idx_in_page].off; wordentry_size = page.entries[idx_in_page].size; return page.entries[idx_in_page].keystr; } bool OffsetIndex::lookup(const char *str, std::set &idxs, glong &next_idx) { bool bFound = false; if (stardict_strcmp(str, first.keystr.c_str()) < 0) { next_idx = 0; return false; } else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0) { next_idx = INVALID_INDEX; return false; } // Search for the first page where the word is likely to be located. glong iFrom = 0, iTo = wordoffset.size() - 2; glong iPage = 0, iThisIndex = 0; while (iFrom <= iTo) { iThisIndex = (iFrom + iTo) / 2; glong cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex)); if (cmpint > 0) iFrom = iThisIndex + 1; else if (cmpint < 0) iTo = iThisIndex - 1; else { bFound = true; break; } } if (bFound) { // We can use this found index (even though it might not be the first) // because we will search backwards later and catch any entries on // previous pages. iPage = iThisIndex; iThisIndex = 0; // first item in the page } else { iPage = iTo; // prev // Not found at the start of a page, so search within the page that // should contain it. Binary search here is slightly overkill (we're // searching at most ENTR_PER_PAGE = 32 elements) but this way next_idx // is treated the same as other Lookup methods. gulong netr = load_page(iPage); iFrom = 0; iTo = netr - 1; while (iFrom <= iTo) { iThisIndex = (iFrom + iTo) / 2; glong cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr); if (cmpint > 0) iFrom = iThisIndex + 1; else if (cmpint < 0) iTo = iThisIndex - 1; else { bFound = true; break; } } } if (!bFound) next_idx = iPage * ENTR_PER_PAGE + iFrom; // next else { // Convert the found in-page index to the dict index. iThisIndex = iPage * ENTR_PER_PAGE + iThisIndex; // In order to return all idxs that match the search string, walk // linearly behind and ahead of the found index. glong iHeadIndex = iThisIndex - 1; // do not include iThisIndex while (iHeadIndex >= 0 && stardict_strcmp(str, get_key(iHeadIndex)) == 0) idxs.insert(iHeadIndex--); do // no need to double-check iThisIndex -- we know it's a match already idxs.insert(iThisIndex++); while (iThisIndex <= real_last.idx && stardict_strcmp(str, get_key(iThisIndex)) == 0); } return bFound; } bool WordListIndex::load(const std::string &url, gulong wc, gulong fsize, bool) { gzFile in = gzopen(url.c_str(), "rb"); if (in == nullptr) return false; idxdatabuf = (gchar *)g_malloc(fsize); const int len = gzread(in, idxdatabuf, fsize); gzclose(in); if (len < 0) return false; if (gulong(len) != fsize) return false; wordlist.resize(wc + 1); gchar *p1 = idxdatabuf; guint32 i; for (i = 0; i < wc; i++) { wordlist[i] = p1; p1 += strlen(p1) + 1 + 2 * sizeof(guint32); } wordlist[wc] = p1; return true; } void WordListIndex::get_data(glong idx) { gchar *p1 = wordlist[idx] + strlen(wordlist[idx]) + sizeof(gchar); wordentry_offset = g_ntohl(get_uint32(p1)); p1 += sizeof(guint32); wordentry_size = g_ntohl(get_uint32(p1)); } bool WordListIndex::lookup(const char *str, std::set &idxs, glong &next_idx) { bool bFound = false; glong iLast = wordlist.size() - 2; if (stardict_strcmp(str, get_key(0)) < 0) { next_idx = 0; } else if (stardict_strcmp(str, get_key(iLast)) > 0) { next_idx = INVALID_INDEX; } else { glong iThisIndex = 0; glong iFrom = 0, iTo = iLast; gint cmpint; while (iFrom <= iTo) { iThisIndex = (iFrom + iTo) / 2; cmpint = stardict_strcmp(str, get_key(iThisIndex)); if (cmpint > 0) iFrom = iThisIndex + 1; else if (cmpint < 0) iTo = iThisIndex - 1; else { bFound = true; break; } } if (!bFound) next_idx = iFrom; // next else { // In order to return all idxs that match the search string, walk // linearly behind and ahead of the found index. glong iHeadIndex = iThisIndex - 1; // do not include iThisIndex while (iHeadIndex >= 0 && stardict_strcmp(str, get_key(iHeadIndex)) == 0) idxs.insert(iHeadIndex--); do // no need to double-check iThisIndex -- we know it's a match already idxs.insert(iThisIndex++); while (iThisIndex <= iLast && stardict_strcmp(str, get_key(iThisIndex)) == 0); } } return bFound; } } // namespace bool SynFile::load(const std::string &url, gulong wc) { struct stat stat_buf; if (!stat(url.c_str(), &stat_buf)) { if (!synfile.open(url.c_str(), stat_buf.st_size)) return false; synlist.resize(wc + 1); gchar *p1 = synfile.begin(); for (unsigned long i = 0; i < wc; i++) { // each entry in a syn-file is: // - 0-terminated string // 4-byte index into .dict file in network byte order synlist[i] = p1; p1 += strlen(p1) + 1 + 4; } synlist[wc] = p1; return true; } else { return false; } } bool SynFile::lookup(const char *str, std::set &idxs, glong &next_idx) { bool bFound = false; glong iLast = synlist.size() - 2; if (iLast < 0) return false; if (stardict_strcmp(str, get_key(0)) < 0) { next_idx = 0; } else if (stardict_strcmp(str, get_key(iLast)) > 0) { next_idx = INVALID_INDEX; } else { glong iThisIndex = 0; glong iFrom = 0, iTo = iLast; gint cmpint; while (iFrom <= iTo) { iThisIndex = (iFrom + iTo) / 2; cmpint = stardict_strcmp(str, get_key(iThisIndex)); if (cmpint > 0) iFrom = iThisIndex + 1; else if (cmpint < 0) iTo = iThisIndex - 1; else { bFound = true; break; } } if (!bFound) next_idx = iFrom; // next else { // In order to return all idxs that match the search string, walk // linearly behind and ahead of the found index. glong iHeadIndex = iThisIndex - 1; // do not include iThisIndex while (iHeadIndex >= 0 && stardict_strcmp(str, get_key(iHeadIndex)) == 0) { const gchar *key = get_key(iHeadIndex--); idxs.insert(g_ntohl(get_uint32(key + strlen(key) + 1))); } do { // no need to double-check iThisIndex -- we know it's a match already const gchar *key = get_key(iThisIndex++); idxs.insert(g_ntohl(get_uint32(key + strlen(key) + 1))); } while (iThisIndex <= iLast && stardict_strcmp(str, get_key(iThisIndex)) == 0); } } return bFound; } bool Dict::Lookup(const char *str, std::set &idxs, glong &next_idx) { bool found = false; found |= syn_file->lookup(str, idxs, next_idx); found |= idx_file->lookup(str, idxs, next_idx); return found; } bool Dict::load(const std::string &ifofilename, bool verbose) { gulong idxfilesize; if (!load_ifofile(ifofilename, idxfilesize)) return false; std::string fullfilename(ifofilename); fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "dict.dz"); if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) { dictdzfile.reset(new DictData); if (!dictdzfile->open(fullfilename, 0)) { // g_print("open file %s failed!\n",fullfilename); return false; } } else { fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1); dictfile = fopen(fullfilename.c_str(), "rb"); if (!dictfile) { // g_print("open file %s failed!\n",fullfilename); return false; } } fullfilename = ifofilename; fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "idx.gz"); if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) { idx_file.reset(new WordListIndex); } else { fullfilename.erase(fullfilename.length() - sizeof(".gz") + 1, sizeof(".gz") - 1); idx_file.reset(new OffsetIndex); } if (!idx_file->load(fullfilename, wordcount, idxfilesize, verbose)) return false; fullfilename = ifofilename; fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "syn"); syn_file.reset(new SynFile); syn_file->load(fullfilename, syn_wordcount); // g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles()); return true; } bool Dict::load_ifofile(const std::string &ifofilename, gulong &idxfilesize) { DictInfo dict_info; if (!dict_info.load_from_ifo_file(ifofilename, false)) return false; if (dict_info.wordcount == 0) return false; ifo_file_name = dict_info.ifo_file_name; wordcount = dict_info.wordcount; syn_wordcount = dict_info.syn_wordcount; bookname = dict_info.bookname; idxfilesize = dict_info.index_file_size; sametypesequence = dict_info.sametypesequence; return true; } bool Dict::LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen) { int iIndexCount = 0; for (guint32 i = 0; i < narticles() && iIndexCount < (iBuffLen - 1); i++) if (g_pattern_spec_match_string(pspec, get_key(i))) aIndex[iIndexCount++] = i; aIndex[iIndexCount] = -1; // -1 is the end. return iIndexCount > 0; } Libs::~Libs() { for (Dict *p : oLib) delete p; } void Libs::load_dict(const std::string &url) { Dict *lib = new Dict; if (lib->load(url, verbose_)) oLib.push_back(lib); else delete lib; } void Libs::load(const std::list &dicts_dirs, const std::list &order_list, const std::list &disable_list) { for_each_file(dicts_dirs, ".ifo", order_list, disable_list, [this](const std::string &url, bool disable) -> void { if (!disable) load_dict(url); }); } bool Libs::LookupSimilarWord(const gchar *sWord, std::set &iWordIndices, int iLib) { bool bFound = false; gchar *casestr; if (!bFound) { // to lower case. casestr = g_utf8_strdown(sWord, -1); if (strcmp(casestr, sWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); // to upper case. if (!bFound) { casestr = g_utf8_strup(sWord, -1); if (strcmp(casestr, sWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } // Upper the first character and lower others. if (!bFound) { gchar *nextchar = g_utf8_next_char(sWord); gchar *firstchar = g_utf8_strup(sWord, nextchar - sWord); nextchar = g_utf8_strdown(nextchar, -1); casestr = g_strdup_printf("%s%s", firstchar, nextchar); g_free(firstchar); g_free(nextchar); if (strcmp(casestr, sWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } if (bIsPureEnglish(sWord)) { // If not Found , try other status of sWord. int iWordLen = strlen(sWord); bool isupcase; gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1); // cut one char "s" or "d" if (!bFound && iWordLen > 1) { isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2); if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2)) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d" if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } // cut "ly" if (!bFound && iWordLen > 2) { isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2); if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 2] = '\0'; // cut "ly" if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4] && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { // doubled sNewWord[iWordLen - 3] = '\0'; if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else { if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } if (!bFound) sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; // restore } } if (!bFound) { if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } } // cut "ing" if (!bFound && iWordLen > 3) { isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3); if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3)) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 3] = '\0'; if (iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5]) && !bIsVowel(sNewWord[iWordLen - 5]) && bIsVowel(sNewWord[iWordLen - 6])) { // doubled sNewWord[iWordLen - 4] = '\0'; if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else { if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } if (!bFound) sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; // restore } } if (!bFound) { if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } if (!bFound) { if (isupcase) strcat(sNewWord, "E"); // add a char "E" else strcat(sNewWord, "e"); // add a char "e" if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } } // cut two char "es" if (!bFound && iWordLen > 3) { isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) && (sWord[iWordLen - 3] == 'S' || sWord[iWordLen - 3] == 'X' || sWord[iWordLen - 3] == 'O' || (iWordLen > 4 && sWord[iWordLen - 3] == 'H' && (sWord[iWordLen - 4] == 'C' || sWord[iWordLen - 4] == 'S')))); if (isupcase || (!strncmp(&sWord[iWordLen - 2], "es", 2) && (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' || sWord[iWordLen - 3] == 'o' || (iWordLen > 4 && sWord[iWordLen - 3] == 'h' && (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's'))))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 2] = '\0'; if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } // cut "ed" if (!bFound && iWordLen > 3) { isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2); if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2)) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 2] = '\0'; if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]) && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { // doubled sNewWord[iWordLen - 3] = '\0'; if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else { if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } if (!bFound) sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; // restore } } if (!bFound) { if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } } // cut "ied" , add "y". if (!bFound && iWordLen > 3) { isupcase = !strncmp(&sWord[iWordLen - 3], "IED", 3); if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ied", 3))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 3] = '\0'; if (isupcase) strcat(sNewWord, "Y"); // add a char "Y" else strcat(sNewWord, "y"); // add a char "y" if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } // cut "ies" , add "y". if (!bFound && iWordLen > 3) { isupcase = !strncmp(&sWord[iWordLen - 3], "IES", 3); if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ies", 3))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 3] = '\0'; if (isupcase) strcat(sNewWord, "Y"); // add a char "Y" else strcat(sNewWord, "y"); // add a char "y" if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } // cut "er". if (!bFound && iWordLen > 2) { isupcase = !strncmp(&sWord[iWordLen - 2], "ER", 2); if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 2] = '\0'; if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } // cut "est". if (!bFound && iWordLen > 3) { isupcase = !strncmp(&sWord[iWordLen - 3], "EST", 3); if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 3] = '\0'; if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } } } g_free(sNewWord); } #if 0 else { //don't change iWordIndex here. //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words. //iWordIndex = INVALID_INDEX; } #endif return bFound; } bool Libs::SimpleLookupWord(const gchar *sWord, std::set &iWordIndices, int iLib) { bool bFound = oLib[iLib]->Lookup(sWord, iWordIndices); if (!bFound && fuzzy_) bFound = LookupSimilarWord(sWord, iWordIndices, iLib); return bFound; } bool Libs::LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size) { if (sWord[0] == '\0') return false; Fuzzystruct oFuzzystruct[reslist_size]; for (int i = 0; i < reslist_size; i++) { oFuzzystruct[i].pMatchWord = nullptr; oFuzzystruct[i].iMatchWordDistance = iMaxFuzzyDistance; } int iMaxDistance = iMaxFuzzyDistance; int iDistance; bool Found = false; EditDistance oEditDistance; glong iCheckWordLen; const char *sCheck; gunichar *ucs4_str1, *ucs4_str2; glong ucs4_str2_len; ucs4_str2 = g_utf8_to_ucs4_fast(sWord, -1, &ucs4_str2_len); unicode_strdown(ucs4_str2); for (size_t iLib = 0; iLib < oLib.size(); ++iLib) { if (progress_func) progress_func(); // if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) { // there are Chinese dicts and English dicts... const int iwords = narticles(iLib); for (int index = 0; index < iwords; index++) { sCheck = poGetWord(index, iLib); // tolower and skip too long or too short words iCheckWordLen = g_utf8_strlen(sCheck, -1); if (iCheckWordLen - ucs4_str2_len >= iMaxDistance || ucs4_str2_len - iCheckWordLen >= iMaxDistance) continue; ucs4_str1 = g_utf8_to_ucs4_fast(sCheck, -1, nullptr); if (iCheckWordLen > ucs4_str2_len) ucs4_str1[ucs4_str2_len] = 0; unicode_strdown(ucs4_str1); iDistance = oEditDistance.CalEditDistance(ucs4_str1, ucs4_str2, iMaxDistance); g_free(ucs4_str1); if (iDistance < iMaxDistance && iDistance < ucs4_str2_len) { // when ucs4_str2_len=1,2 we need less fuzzy. Found = true; bool bAlreadyInList = false; int iMaxDistanceAt = 0; for (int j = 0; j < reslist_size; j++) { if (oFuzzystruct[j].pMatchWord && strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0) { // already in list bAlreadyInList = true; break; } // find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time. if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance) { iMaxDistanceAt = j; } } if (!bAlreadyInList) { if (oFuzzystruct[iMaxDistanceAt].pMatchWord) g_free(oFuzzystruct[iMaxDistanceAt].pMatchWord); oFuzzystruct[iMaxDistanceAt].pMatchWord = g_strdup(sCheck); oFuzzystruct[iMaxDistanceAt].iMatchWordDistance = iDistance; // calc new iMaxDistance iMaxDistance = iDistance; for (int j = 0; j < reslist_size; j++) { if (oFuzzystruct[j].iMatchWordDistance > iMaxDistance) iMaxDistance = oFuzzystruct[j].iMatchWordDistance; } // calc new iMaxDistance } // add to list } // find one } // each word } // each lib g_free(ucs4_str2); if (Found) // sort with distance std::sort(oFuzzystruct, oFuzzystruct + reslist_size, [](const Fuzzystruct &lh, const Fuzzystruct &rh) -> bool { if (lh.iMatchWordDistance != rh.iMatchWordDistance) return lh.iMatchWordDistance < rh.iMatchWordDistance; if (lh.pMatchWord && rh.pMatchWord) return stardict_strcmp(lh.pMatchWord, rh.pMatchWord) < 0; return false; }); for (gint i = 0; i < reslist_size; ++i) reslist[i] = oFuzzystruct[i].pMatchWord; return Found; } gint Libs::LookupWithRule(const gchar *word, gchar **ppMatchWord) { glong aiIndex[MAX_MATCH_ITEM_PER_LIB + 1]; gint iMatchCount = 0; GPatternSpec *pspec = g_pattern_spec_new(word); for (std::vector::size_type iLib = 0; iLib < oLib.size(); iLib++) { // if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib)) // -iMatchCount,so save time,but may got less result and the word may repeat. if (oLib[iLib]->LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1)) { if (progress_func) progress_func(); for (int i = 0; aiIndex[i] != -1; i++) { const gchar *sMatchWord = poGetWord(aiIndex[i], iLib); bool bAlreadyInList = false; for (int j = 0; j < iMatchCount; j++) { if (strcmp(ppMatchWord[j], sMatchWord) == 0) { // already in list bAlreadyInList = true; break; } } if (!bAlreadyInList) ppMatchWord[iMatchCount++] = g_strdup(sMatchWord); } } } g_pattern_spec_free(pspec); if (iMatchCount) // sort it. std::sort(ppMatchWord, ppMatchWord + iMatchCount, [](const char *lh, const char *rh) -> bool { return stardict_strcmp(lh, rh) < 0; }); return iMatchCount; } bool Libs::LookupData(const gchar *sWord, std::vector *reslist) { std::vector SearchWords; std::string SearchWord; const char *p = sWord; while (*p) { if (*p == '\\') { p++; switch (*p) { case ' ': SearchWord += ' '; break; case '\\': SearchWord += '\\'; break; case 't': SearchWord += '\t'; break; case 'n': SearchWord += '\n'; break; default: SearchWord += *p; } } else if (*p == ' ') { if (!SearchWord.empty()) { SearchWords.push_back(SearchWord); SearchWord.clear(); } } else { SearchWord += *p; } p++; } if (!SearchWord.empty()) { SearchWords.push_back(SearchWord); SearchWord.clear(); } if (SearchWords.empty()) return false; guint32 max_size = 0; gchar *origin_data = nullptr; for (std::vector::size_type i = 0; i < oLib.size(); ++i) { if (!oLib[i]->containSearchData()) continue; if (progress_func) progress_func(); const gulong iwords = narticles(i); const gchar *key; guint32 offset, size; for (gulong j = 0; j < iwords; ++j) { oLib[i]->get_key_and_data(j, &key, &offset, &size); if (size > max_size) { origin_data = (gchar *)g_realloc(origin_data, size); max_size = size; } if (oLib[i]->SearchData(SearchWords, offset, size, origin_data)) reslist[i].push_back(g_strdup(key)); } } g_free(origin_data); std::vector::size_type i; for (i = 0; i < oLib.size(); ++i) if (!reslist[i].empty()) break; return i != oLib.size(); } /**************************************************/ query_t analyze_query(const char *s, std::string &res) { if (!s || !*s) { res = ""; return qtSIMPLE; } if (*s == '/') { res = s + 1; return qtFUZZY; } if (*s == '|') { res = s + 1; return qtDATA; } bool regexp = false; const char *p = s; res = ""; for (; *p; res += *p, ++p) { if (*p == '\\') { ++p; if (!*p) break; continue; } if (*p == '*' || *p == '?') regexp = true; } if (regexp) return qtREGEXP; return qtSIMPLE; }