Skip to content

Commit dcf8fd0

Browse files
committed
importEHdb: persistence cache with SQL index
This reduces RAM usage and startup time.
1 parent dab32cf commit dcf8fd0

1 file changed

Lines changed: 45 additions & 21 deletions

File tree

comiclib/scanner/30-importEHdb.py

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22
from typing import Union
33
import sqlite3, re, ast
44
from datetime import date
5+
from urllib.parse import urlsplit
56
from pydantic import Field
67
from pydantic_settings import BaseSettings
78

89
class Settings(BaseSettings):
910
importEHdb_thumb: bool = True
1011
importEHdb_matchtitle: Union[bool, str] = Field(default=True, union_mode='left_to_right')
1112
importEHdb_matchtorrent: bool = True
12-
importEHdb_API_DUMP_PATH: str = "api_dump.sqlite"
13+
importEHdb_database_URI: str = "file:api_dump.sqlite?mode=rw"
1314
settings = Settings()
1415

1516
import logging
@@ -30,6 +31,14 @@ def blur_title(title: str):
3031
'+': None
3132
}))
3233

34+
# Please update the version once the above function is updated.
35+
blur_title_version = 1
36+
title2gid_table = 'gallery' if settings.importEHdb_matchtitle == 'exact' else f'comiclib_title2gid_v{blur_title_version}'
37+
title2gid_index = 'gallery_title' if settings.importEHdb_matchtitle == 'exact' else f'title_index_v{blur_title_version}'
38+
titlejpn2gid_table = f"comiclib_titlejpn2gid_v{0 if settings.importEHdb_matchtitle == 'exact' else blur_title_version}"
39+
titlejpn2gid_index = f"titlejpn_index_v{0 if settings.importEHdb_matchtitle == 'exact' else blur_title_version}"
40+
# The title_jpn column in table gallery contains many NULLs, so exact matching uses the separate table comiclib_titlejpn2gid_v0 (indexed by titlejpn_index_v0) instead.
41+
3342
def dict_factory(cursor, row):
3443
fields = [column[0] for column in cursor.description]
3544
return {key: value for key, value in zip(fields, row)}
@@ -39,40 +48,55 @@ class Scanner:
3948
Currently only support matching by the source URL (from previous scanners).'''
4049

4150
def __init__(self) -> None:
42-
if Path(settings.importEHdb_API_DUMP_PATH).exists():
43-
logger.info('Loading ehentai metadata database, please wait...')
44-
# do it in readonly mode, to maintain a readonly container image
45-
self.con = sqlite3.connect("file:"+settings.importEHdb_API_DUMP_PATH+"?mode=ro&immutable=1", uri=True, check_same_thread=False)
51+
db_path = urlsplit(settings.importEHdb_database_URI).path
52+
if Path(db_path).exists():
53+
self.con = sqlite3.connect(settings.importEHdb_database_URI, uri=True, check_same_thread=False)
54+
# Build cache during the first run
4655
if settings.importEHdb_matchtitle:
47-
self.db_title = {blur_title(row[0]): row[1] for row in self.con.execute("SELECT title, gid FROM gallery") if not row[0] is None}
48-
self.db_title_jpn = {blur_title(row[0]): row[1] for row in self.con.execute("SELECT title_jpn, gid FROM gallery") if not row[0] is None}
49-
if settings.importEHdb_matchtorrent:
50-
self.db_title_torrent = {}
56+
req_title2gid_index = self.con.execute(f"SELECT name FROM sqlite_master WHERE type='index' AND name=?", (title2gid_index,)).fetchone() is None
57+
req_titlejpn2gid_index = self.con.execute(f"SELECT name FROM sqlite_master WHERE type='index' AND name=?", (titlejpn2gid_index,)).fetchone() is None
58+
if req_title2gid_index or req_titlejpn2gid_index:
59+
logger.info('Building ehentai database title cache, please wait...')
60+
if req_title2gid_index:
61+
if settings.importEHdb_matchtitle != 'exact':
62+
self.con.execute(f"CREATE TABLE {title2gid_table} (title TEXT NOT NULL, gid INTEGER NOT NULL, FOREIGN KEY (gid) REFERENCES gallery (gid))")
63+
self.con.executemany(f"INSERT INTO {title2gid_table} VALUES(?, ?)", ((blur_title(row[0]), row[1]) for row in self.con.execute("SELECT title, gid FROM gallery") if row[0] is not None))
64+
self.con.execute(f"CREATE INDEX {title2gid_index} ON {title2gid_table} (title)")
65+
if req_titlejpn2gid_index:
66+
self.con.execute(f"CREATE TABLE {titlejpn2gid_table} (title_jpn TEXT NOT NULL, gid INTEGER NOT NULL, FOREIGN KEY (gid) REFERENCES gallery (gid))")
67+
self.con.executemany(f"INSERT INTO {titlejpn2gid_table} VALUES(?, ?)", ((blur_title(row[0]), row[1]) for row in self.con.execute("SELECT title_jpn, gid FROM gallery") if row[0] is not None))
68+
self.con.execute(f"CREATE INDEX {titlejpn2gid_index} ON {titlejpn2gid_table} (title_jpn)")
69+
self.con.commit()
70+
if settings.importEHdb_matchtorrent and self.con.execute(f"SELECT name FROM sqlite_master WHERE type='index' AND name='torrent_index_v{blur_title_version}'").fetchone() is None:
71+
logger.info('Building ehentai database torrent cache, please wait...')
72+
self.con.execute(f"CREATE TABLE comiclib_torrent2gid_v{blur_title_version} (title TEXT NOT NULL, gid INTEGER NOT NULL, FOREIGN KEY (gid) REFERENCES gallery (gid))")
5173
for torrents, gid in self.con.execute("SELECT torrents, gid FROM gallery"):
5274
if torrents is None: continue
5375
for torrent in ast.literal_eval(torrents):
5476
if torrent['name'] is None: continue
55-
self.db_title_torrent[blur_title(Path(torrent['name']).stem)] = gid
77+
self.con.execute(f"INSERT INTO comiclib_torrent2gid_v{blur_title_version} VALUES(?, ?)", (blur_title(Path(torrent['name']).stem), gid))
78+
self.con.execute(f"CREATE INDEX torrent_index_v{blur_title_version} ON comiclib_torrent2gid_v{blur_title_version} (title)")
79+
self.con.commit()
5680
self.con.row_factory = dict_factory
57-
logger.info('Loaded.')
5881
else:
82+
logger.warning(f"{db_path} not found. importEHdb.py will be skipped. Please download it from https://sukebei.nyaa.si/user/gipaf23445 if you need it.")
5983
self.con = None
6084

6185
def get_gid(self, metadata: dict):
6286
if not metadata["source"] is None and not (m := re.match(r"https?://e[x-]hentai\.org/g/(\d+)/", metadata["source"])) is None:
6387
return m[1]
6488
if settings.importEHdb_matchtitle:
65-
if not (gid := self.db_title.get(blur_title(metadata["title"]))) is None:
66-
return gid
67-
elif not (gid := self.db_title_jpn.get(blur_title(metadata["title"]))) is None:
68-
return gid
69-
elif not (gid := self.db_title_jpn.get(blur_title(metadata["subtitle"]))) is None:
70-
return gid
89+
if (row := self.con.execute(f"SELECT gid FROM {title2gid_table} INDEXED BY {title2gid_index} WHERE title == ?", (blur_title(metadata["title"]),)).fetchone()) is not None:
90+
return row['gid']
91+
elif (row := self.con.execute(f"SELECT gid FROM {titlejpn2gid_table} INDEXED BY {titlejpn2gid_index} WHERE title_jpn == ?", (blur_title(metadata["title"]),)).fetchone()) is not None:
92+
return row['gid']
93+
elif (row := self.con.execute(f"SELECT gid FROM {titlejpn2gid_table} INDEXED BY {titlejpn2gid_index} WHERE title_jpn == ?", (blur_title(metadata["subtitle"]),)).fetchone()) is not None:
94+
return row['gid']
7195
if settings.importEHdb_matchtorrent:
72-
if not (gid := self.db_title_torrent.get(blur_title(metadata["title"]))) is None:
73-
return gid
74-
elif not (gid := self.db_title_torrent.get(blur_title(metadata["subtitle"]))) is None:
75-
return gid
96+
if (row := self.con.execute(f"SELECT gid FROM comiclib_torrent2gid_v{blur_title_version} INDEXED BY torrent_index_v{blur_title_version} WHERE title == ?", (blur_title(metadata["title"]),)).fetchone()) is not None:
97+
return row['gid']
98+
elif (row := self.con.execute(f"SELECT gid FROM comiclib_torrent2gid_v{blur_title_version} INDEXED BY torrent_index_v{blur_title_version} WHERE title == ?", (blur_title(metadata["subtitle"]),)).fetchone()) is not None:
99+
return row['gid']
76100
return None
77101

78102
def scan(self, path: Path, id: str, metadata: dict, prev_scanners: list[str]) -> bool:

0 commit comments

Comments
 (0)