From ec8fb437882fa467551d3edf0d477cd12817cdd0 Mon Sep 17 00:00:00 2001 From: Damian Silbergleith <14797221+ds17f@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:17:03 -0700 Subject: [PATCH] fix: strip lyric pages whose credit uses a colon or band-as-author line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The safe-publish pass gated on the "words/music by ..." authorship line, so five real licensed-lyric pages slipped through with their lyrics intact: - eter, libe, onlytime, way2 — credit uses a colon ("Words: Hunter; music: Garcia") rather than "by" - ydha — credit is "By the Grateful Dead" (band-as-author, no words/music token at all) All five still carry the publisher's licensed-lyric signature, "Copyright Ice Nine Publishing; used by permission", which sits just above the lyric block exactly where the authorship line normally does. Add it as a fallback credit anchor in CREDIT_RE. It is the definitive marker of reproduced GD lyrics: essays that merely quote permission say "Used with permission" without naming Ice Nine, and pages whose blockquote is an annotation (operator's OED entry, slip's reader email) have no permission line, so they stay untouched. On the 118 "words by" pages the authorship line still matches first, leaving their stripped output byte-for-byte unchanged; exactly the five intended pages flip from skip to strip. make dist/safe + make audit stay green. Co-Authored-By: Claude Opus 4.8 --- scripts/safe_build.py | 53 +++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/scripts/safe_build.py b/scripts/safe_build.py index fce879f..1bf6a07 100644 --- a/scripts/safe_build.py +++ b/scripts/safe_build.py @@ -52,19 +52,42 @@ DIST = Path(__file__).parent.parent / "dist" SONGS_URL = "https://www.dead.net/songs" -# First annotation anchor: marks where the essay begins. Lyrics -# live before it; everything from it onward is annotation we keep. 
-SEAM_RE = re.compile(r" in the title heading at the very top (scarlet.html, +# stephen.html), so the first on the page is not the seam. +# +# Two forms anchor a song page. The usual one is the "words/music by ..." +# authorship line. But a handful of pages spell the authorship differently -- +# with a colon ("Words: Hunter; music: Garcia", e.g. libe/eter/onlytime/way2) or +# crediting the band as a whole ("By the Grateful Dead", e.g. ydha) -- which the +# authorship pattern misses. Those all still carry the publisher's licensed-lyric +# signature, "Copyright Ice Nine Publishing; used by permission", which sits just +# above the lyric block exactly as the authorship line does. It is the definitive +# marker of reproduced GD lyrics: essays that merely quote permission say "Used +# with permission" without naming Ice Nine (silber/miller/stephen/tribute), and +# pages whose blockquote is an annotation rather than licensed lyrics (operator's +# OED entry, slip's reader email) have no permission line at all. So we accept it +# as a fallback credit anchor. On the 118 "words by" pages the authorship line +# still matches first, leaving their output byte-for-byte unchanged. CREDIT_RE = re.compile( - r"(used by permission|words?\s+(?:and\s+music\s+)?by|lyrics?\s+by|music\s+by)", + r"words?\s+(?:and\s+music\s+)?by|lyrics?\s+by|music\s+by" + r"|copyright\s+ice\s+nine", re.I, ) +# The annotation seam: where commentary begins. A real section header is an +# ADJACENT to an

(either order), always introduced after the +# lyrics. This deliberately ignores inline anchors that sit *inside* the +# lyrics with no

(e.g. ripple.html's Let there be songs..., or +# ramble2.html's ), which a bare "" match would mistake +# for the seam and cut the lyric strip short. +SEAM_RE = re.compile( + r"

\s*]*>\s*(?:\s*)?.*?", re.I | re.S) # Some pages lay lyrics out as bare
<br>-separated lines (or inside a layout @@ -107,16 +130,16 @@ def _lyric_start(text, lo, seam): def strip_page(text): """Return (new_text, n_blocks_removed) if this is a song page with lyrics to strip, else None to leave the page untouched.""" - seam_m = SEAM_RE.search(text) - if not seam_m: - return None # no annotation anchor -> not a song page - seam = seam_m.start() - - credit_m = CREDIT_RE.search(text[:seam]) + credit_m = CREDIT_RE.search(text) if not credit_m: return None # no song-credit line -> essay/bio, skip lo = credit_m.start() + seam_m = SEAM_RE.search(text, credit_m.end()) + if not seam_m: + return None # no annotation section -> nothing to bound + seam = seam_m.start() + # Case 1: lyrics wrapped in
<blockquote> (the common layout). Targets are the # blockquotes starting between the credit line and the seam. targets = [m for m in BLOCKQUOTE_RE.finditer(text) if lo <= m.start() < seam]