diff --git a/scripts/safe_build.py b/scripts/safe_build.py index fce879f..1bf6a07 100644 --- a/scripts/safe_build.py +++ b/scripts/safe_build.py @@ -52,19 +52,42 @@ DIST = Path(__file__).parent.parent / "dist" SONGS_URL = "https://www.dead.net/songs" -# First annotation anchor: marks where the essay begins. Lyrics -# live before it; everything from it onward is annotation we keep. -SEAM_RE = re.compile(r" in the title heading at the very top (scarlet.html, +# stephen.html), so the first on the page is not the seam. +# +# Two forms anchor a song page. The usual one is the "words/music by ..." +# authorship line. But a handful of pages spell the authorship differently -- +# with a colon ("Words: Hunter; music: Garcia", e.g. libe/eter/onlytime/way2) or +# crediting the band as a whole ("By the Grateful Dead", e.g. ydha) -- which the +# authorship pattern misses. Those all still carry the publisher's licensed-lyric +# signature, "Copyright Ice Nine Publishing; used by permission", which sits just +# above the lyric block exactly as the authorship line does. It is the definitive +# marker of reproduced GD lyrics: essays that merely quote permission say "Used +# with permission" without naming Ice Nine (silber/miller/stephen/tribute), and +# pages whose blockquote is an annotation rather than licensed lyrics (operator's +# OED entry, slip's reader email) have no permission line at all. So we accept it +# as a fallback credit anchor. On the 118 "words by" pages the authorship line +# still matches first, leaving their output byte-for-byte unchanged. CREDIT_RE = re.compile( - r"(used by permission|words?\s+(?:and\s+music\s+)?by|lyrics?\s+by|music\s+by)", + r"words?\s+(?:and\s+music\s+)?by|lyrics?\s+by|music\s+by" + r"|copyright\s+ice\s+nine", re.I, ) +# The annotation seam: where commentary begins. A real section header is an +# ADJACENT to an

(either order), always introduced after the +# lyrics. This deliberately ignores inline anchors that sit *inside* the +# lyrics with no

(e.g. ripple.html's Let there be songs..., or +# ramble2.html's ), which a bare "" match would mistake +# for the seam and cut the lyric strip short. +SEAM_RE = re.compile( + r"

\s*]*>\s*(?:\s*)?.*?", re.I | re.S) # Some pages lay lyrics out as bare
<br>-separated lines (or inside a layout @@ -107,16 +130,16 @@ def _lyric_start(text, lo, seam): def strip_page(text): """Return (new_text, n_blocks_removed) if this is a song page with lyrics to strip, else None to leave the page untouched.""" - seam_m = SEAM_RE.search(text) - if not seam_m: - return None # no annotation anchor -> not a song page - seam = seam_m.start() - - credit_m = CREDIT_RE.search(text[:seam]) + credit_m = CREDIT_RE.search(text) if not credit_m: return None # no song-credit line -> essay/bio, skip lo = credit_m.start() + seam_m = SEAM_RE.search(text, credit_m.end()) + if not seam_m: + return None # no annotation section -> nothing to bound + seam = seam_m.start() + # Case 1: lyrics wrapped in <blockquote>
(the common layout). Targets are the # blockquotes starting between the credit line and the seam. targets = [m for m in BLOCKQUOTE_RE.finditer(text) if lo <= m.start() < seam]