experimental: custom ID

URenko · URenko · commit 0627fcb464af · 2023-11-04T20:57:59.000+08:00
diff --git a/comiclib/main.py b/comiclib/main.py
@@ -1,4 +1,4 @@
-__version__ = "0.0.2"
+__version__ = "0.0.3.dev"
 print(f" >>> ComicLib v{__version__}")
 
 from .scan import watch, scannow
diff --git a/comiclib/scan.py b/comiclib/scan.py
@@ -44,14 +44,13 @@ def scan(paths):
                 Archive.path == p.as_posix()))
             if old_a is None:
                 a = Archive(path=p.as_posix())
-                archive_id = hashlib.blake2b(
-                    p.as_posix().encode(), digest_size=20).hexdigest()
+                archive_id = '00' + hashlib.blake2b(p.as_posix().encode(), digest_size=19).hexdigest()  # 00 stands for ID type origin
             elif settings.skip_exits:
                 continue
             else:
                 a = old_a
                 archive_id = old_a.id
-            metadata = {"title": a.title, "subtitle": a.subtitle, "source": a.source, "pagecount": a.pagecount, "tags": set(
+            metadata = {"id": archive_id, "title": a.title, "subtitle": a.subtitle, "source": a.source, "pagecount": a.pagecount, "tags": set(
                 t.tag for t in a.tags if not t.tag.startswith("date_added:")), "categories": set(c.name for c in a.categories)}
             real_path = Path(settings.content) / p
             prev_scanners = []
@@ -76,7 +75,7 @@ def scan(paths):
             for tag in filter(lambda t: not t.tag in metadata["tags"], a.tags):
                 a.tags.remove(tag)
             for tag in metadata["tags"] - set(t.tag for t in a.tags):
-                a.tags.append(Tag(archive_id=archive_id, tag=tag))
+                a.tags.append(Tag(archive_id=metadata["id"], tag=tag))
             for category in filter(lambda c: not c.name in metadata["categories"], a.categories):
                 a.categories.remove(category)
             for category in metadata["categories"] - set(c.name for c in a.categories):
@@ -85,7 +84,7 @@ def scan(paths):
                     db.add(c)
                 a.categories.append(c)
             if old_a is None:
-                a.id = archive_id
+                a.id = metadata["id"]
                 db.add(a)
             db.commit()
 
diff --git a/comiclib/scanner/30-importEHdb.py b/comiclib/scanner/30-importEHdb.py
@@ -76,8 +76,10 @@ def scan(self, path: Path, id: str, metadata: dict, prev_scanners: list[str]) ->
             return False
         elif prev_scanners and not (gid := self.get_gid(metadata)) is None:
             logger.info(f' <- {path}')
-            res = self.con.execute("SELECT title, title_jpn, category, posted, thumb, artist, `group`, parody, character, female, male, language, mixed, other, cosplayer, rest FROM gallery WHERE gid == ?", (gid,)).fetchone()
+            res = self.con.execute("SELECT title, title_jpn, category, posted, thumb, token, artist, `group`, parody, character, female, male, language, mixed, other, cosplayer, rest FROM gallery WHERE gid == ?", (gid,)).fetchone()
             if res is None: return False
+            token = res.pop('token')
+            metadata["id"] = f"EH{gid:>018}{token}{id[-10:]}"
             metadata["title"] = res.pop("title")
             metadata["subtitle"] = res.pop("title_jpn")
             thumb = res.pop("thumb")
@@ -90,7 +92,6 @@ def scan(self, path: Path, id: str, metadata: dict, prev_scanners: list[str]) ->
                 if res[namespace] is None: continue
                 metadata["tags"] |= set(map(lambda v: namespace+':'+v, ast.literal_eval(res[namespace])))
             if metadata["source"] is None or not re.fullmatch(r"https?://e[x-]hentai\.org/g/(\d+)/", metadata["source"]) is None:
-                token = self.con.execute("SELECT token FROM gallery WHERE gid == ?", (gid,)).fetchone()['token']
                 metadata["source"] = f"https://exhentai.org/g/{gid}/{token}/"
             return True
         else:
diff --git a/comiclib/scanner/40-thumb.py b/comiclib/scanner/40-thumb.py
@@ -11,6 +11,6 @@ def scan(self, path: Path, id: str, metadata: dict, prev_scanners: list[str]) ->
         if not prev_scanners or not metadata.get('thumb') is None:
             return False
         logger.info(f' <- {path}')
-        thumb = extract_thumbnail(path, id, 1, cover=True)
+        thumb = extract_thumbnail(path, metadata['id'], 1, cover=True)
         metadata['thumb'] = str(thumb)
         return True
diff --git a/docs/en/docs/scanner.md b/docs/en/docs/scanner.md
@@ -104,6 +104,13 @@ This return value is also used as a reference for subsequent scripts.
 Parameters of `Scanner.scan`:
 
 * `path`: file/directory path for the input comic.
-* `id`: Unique ID generated by ComicLib, do not attempt to write this value.
-* `metadata`: Metadata obtained after processing by the previous script. The fields include `title`, `subtitle` `source`, `pagecount`, `tags`, `categories`. The initial values are `None` or `set()`. scanners write the resulting metadata into this `dict`.
+* `id`: The unique ID pre-generated by ComicLib, which is a hash of the path relative to `CONTENT`. The database uses the ID given by `metadata[id]` instead; see the description of custom IDs below.
+* `metadata`: Metadata obtained after processing by the previous script. The fields include `id`, `title`, `subtitle`, `source`, `pagecount`, `tags`, `categories`. The initial values are `None` or `set()`, except for `id`. Scanners write the resulting metadata into this `dict`.
 * `prev_scanners`: The name of the script that previously returned `True`.
+
+!!! example "custom ID (Experimental)"
+    ComicLib first pre-generates a unique ID based on the path, starting with `00`, and passes it as the value of the parameter `id`. Initially this ID is the same as `metadata[id]`.
+    A scanner can generate a new ID based on `id`, on the `metadata[id]` value as modified by previous scanning scripts, and on other information, and write it into `metadata[id]`.
+    By convention, the first two characters of the ID indicate the type of the ID. For example, the built-in scanner 30-importEHdb.py uses `EH` to mark its own ID format, which contains the ehentai gid information.
+    The final `metadata[id]` is written to the database as a unique identifier for the comic.
+    The ID must be unique and exactly 40 characters long. Custom IDs have no effect when rescanning to update metadata.
diff --git a/docs/zh/docs/scanner.md b/docs/zh/docs/scanner.md
@@ -103,6 +103,13 @@ class Scanner:
 `Scanner.scan` 的参数：
 
 * `path`: 文件/目录路径
-* `id`: 由 ComicLib 生成的唯一 ID，不要试图写入此值
-* `metadata`: 由前面脚本处理后得到的元数据，字段有 `title`, `subtitle` `source`, `pagecount`, `tags`, `categories`，最初值皆为 `None` 或 `set()`。扫描脚本将得到的元数据写入该 `dict`。
+* `id`: 由 ComicLib 预生成的唯一 ID，是相对于 `CONTENT` 的路径的散列值，但最终数据库使用的是 `metadata[id]` 给出的 ID，见下面自定义ID的说明。
+* `metadata`: 由前面脚本处理后得到的元数据，字段有 `id`, `title`, `subtitle`, `source`, `pagecount`, `tags`, `categories`，除 `id` 外最初值皆为 `None` 或 `set()`。扫描脚本将得到的元数据写入该 `dict`。
 * `prev_scanners`: 前面返回 `True` 的脚本名称。
+
+!!! example "自定义ID（试验性）"
+    ComicLib 先根据路径预生成一个唯一 ID，以 `00` 开头，作为参数 `id` 的值。最初这一 ID 与 `metadata[id]` 相同。
+    扫描脚本可以根据 `id` 、前面扫描脚本修改的 `metadata[id]` 和其他信息生成一个新的 ID，写入 `metadata[id]`。
+    一般约定 ID 的前两个字符表示 ID 的含义，如内置脚本 30-importEHdb.py 用 `EH` 表示其设计的带有 ehentai gid 信息的 ID。
+    最终的 `metadata[id]` 作为漫画的唯一标识符写入数据库。
+    ID 必须保证唯一，且为 40 个字符。自定义 ID 对重新扫描更新元数据无效。
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "comiclib"
-version = "0.0.2"
+version = "0.0.3.dev"
 authors = [
   { name="URenko" },
 ]

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "0.0.2"`
	`1`	`+__version__ = "0.0.3.dev"`
`2`	`2`	`print(f" >>> ComicLib v{__version__}")`
`3`	`3`
`4`	`4`	`from .scan import watch, scannow`
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"`
`4`	`4`
`5`	`5`	`[project]`
`6`	`6`	`name = "comiclib"`
`7`		`-version = "0.0.2"`
	`7`	`+version = "0.0.3.dev"`
`8`	`8`	`authors = [`
`9`	`9`	`{ name="URenko" },`
`10`	`10`	`]`