From f1fdc5b6607aed0821a364d509659f6a3c89c8cd Mon Sep 17 00:00:00 2001 From: 1700308200 <1700308200@qq.com> Date: Tue, 19 May 2026 00:26:16 +0800 Subject: [PATCH] Add auto-language extra for code block language detection (closes #361) --- CHANGES.md | 2 + README.md | 5 +- lib/markdown2.py | 313 +++++++++++++++++++++++++++ test/test_auto_language_detection.py | 276 +++++++++++++++++++++++ test/tm-cases/auto_language.html | 22 ++ test/tm-cases/auto_language.opts | 1 + test/tm-cases/auto_language.tags | 1 + test/tm-cases/auto_language.text | 19 ++ 8 files changed, 637 insertions(+), 2 deletions(-) create mode 100644 test/test_auto_language_detection.py create mode 100644 test/tm-cases/auto_language.html create mode 100644 test/tm-cases/auto_language.opts create mode 100644 test/tm-cases/auto_language.tags create mode 100644 test/tm-cases/auto_language.text diff --git a/CHANGES.md b/CHANGES.md index 718fb9ed..1077139f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,6 +8,8 @@ - [pull #700] Fix XSS from code spans in image alt text (#699) - [pull #701] Allow boolean attribute syntax in `markdown-in-html` extra - [pull #704] Fix XSS from smuggling spans into image attributes (#702, #703) +- Add ``auto-language`` extra for automatic language detection in fenced + code blocks (issue #361) ## python-markdown2 2.5.5 diff --git a/README.md b/README.md index f1e89c9f..54850014 100644 --- a/README.md +++ b/README.md @@ -92,8 +92,9 @@ as a script: '

boo!

\n' ``` There are a number of currently implemented extras for tables, footnotes, -syntax coloring of `<pre><code>`-blocks, auto-linking patterns, table of contents,
-Smarty Pants (for fancy quotes, dashes, etc.) and more. See the [Extras
+syntax coloring of `<pre><code>`-blocks, automatic language detection for fenced
+code blocks, auto-linking patterns, table of contents, Smarty Pants (for
+fancy quotes, dashes, etc.) and more. See the [Extras
 wiki page](https://github.com/trentm/python-markdown2/wiki/Extras) for full
 details.
 
diff --git a/lib/markdown2.py b/lib/markdown2.py
index dc698970..d942e102 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -4319,12 +4319,325 @@ def test(self, text):
         return '||' in text
 
 
+class AutoLanguage(Extra):
+    """
+    Automatically detect the programming language of fenced code blocks
+    that don't have an explicit language tag. Uses heuristic pattern
+    matching to identify common languages (Python, JavaScript, HTML,
+    CSS, SQL, Bash, Java, Go, Rust, Ruby, PHP, JSON, YAML, C/C++).
+
+    When combined with ``fenced-code-blocks`` and Pygments, this enables
+    syntax highlighting for code blocks without manual language markers.
+
+    Blocks that already carry a language tag are never modified, and
+    blocks for which no language can be detected are left as-is.
+    """
+    name = 'auto-language'
+    # Scheduled ahead of FencedCodeBlocks so that extra sees the fences
+    # after a language tag has been added.
+    order = (FencedCodeBlocks,), ()
+
+    # Matches *every* fenced block, tagged or not. Tagged blocks have to be
+    # consumed as well: if only untagged openers were matched, the bare
+    # closing fence of a tagged block would be mistaken for the opening
+    # fence of an untagged one, and the prose that follows it would be
+    # treated as code (and the closing fence could get a language appended).
+    _fenced_block_re = re.compile(r'''
+        (?:\n+|\A\n?|(?<=\n))
+        (^[ \t]*`{3,})[ \t]*        # $1 = opening fence
+        ([\w+-]+)?[ \t]*\n          # $2 = optional explicit language tag
+        (.*?)                        # $3 = code content
+        \1[ \t]*\n                  # closing fence (same indent/backticks)
+        ''', re.M | re.X | re.S)
+
+    def run(self, text: str):
+        """
+        Return ``text`` with a detected language name appended to the
+        opening fence of each untagged fenced code block.
+        """
+        def add_lang(match):
+            if match.group(2):
+                # Explicit language tag: leave the whole block untouched.
+                return match.group(0)
+            fence = match.group(1)
+            code = match.group(3)
+            lang = detect_language(code)
+            if not lang:
+                # Nothing detected: keep the block exactly as written.
+                return match.group(0)
+            # Preserve leading context (newlines) so FencedCodeBlocks
+            # can still match the modified block.
+            leading = match.string[match.start():match.start(1)]
+            return f"{leading}{fence}{lang}\n{code}{fence}\n"
+
+        return self._fenced_block_re.sub(add_lang, text)
+
+    def test(self, text: str) -> bool:
+        """Cheap pre-check: only run when a triple-backtick fence is present."""
+        return '```' in text
+
+
+def detect_language(code: str):
+    """
+    Detect the programming language of a code snippet using heuristic
+    pattern scoring. Returns a Pygments-compatible language name, or
+    None if no language could be determined with sufficient confidence.
+    """
+    if not code or not code.strip():
+        return None
+
+    scores = {}
+
+    # -- Python --
+    s = 0
+    if re.search(r'^\s*(def\s+\w+\s*\(|class\s+\w+.*:)',
+                 code, re.M):
+        s += 4
+    if re.search(r'^\s*(import\s+\w+|from\s+\w+\s+import\b)',
+                 code, re.M):
+        s += 4
+    if re.search(r'\bself\.', code):
+        s += 3
+    if re.search(
+        r'^\s*(elif\s+|else:|except\s|finally:|try:|with\s|yield\b|raise\s)',
+        code, re.M
+    ):
+        s += 2
+    if re.search(r'\b(print|range|len|enumerate|zip|isinstance|hasattr)\(',
+                 code):
+        s += 1
+    if re.search(r'\b(lambda\s|__\w+__)', code):
+        s += 2
+    if re.search(r'\bNone\b', code):
+        s += 1
+    if re.search(r'^\s*@\w+', code, re.M):
+        s += 2
+    if re.search(r'\b(?:__name__|__main__)\b', code):
+        s += 2
+    scores['python'] = s
+
+    # -- JavaScript / TypeScript --
+    s = 0
+    if re.search(r'\b(?:const|let|var)\s+\w+\s*=', code):
+        s += 3
+    if re.search(r'\bfunction\s+\w+\s*\(', code):
+        s += 3
+    if re.search(r'=>\s*\{', code):
+        s += 3
+    if re.search(r'\bconsole\.(?:log|error|warn)\(', code):
+        s += 2
+    if re.search(r'\bdocument\.\w+', code):
+        s += 2
+    if re.search(
+        r'\b(?:require|module\.exports|export\s+(?:default\s+)?)\b',
+        code
+    ):
+        s += 2
+    if re.search(r'\bnew\s+\w+\s*\(', code):
+        s += 1
+    if re.search(r'\bnull\b', code):
+        s += 1
+    # TypeScript type annotations
+    if re.search(r':\s*(?:string|number|boolean|void)\b', code):
+        s += 2
+    scores['javascript'] = s
+
+    # -- HTML --
+    s = 0
+    if re.search(r' x * 2);
+console.log(doubled);'''),
+
+    ("javascript", '''export default function App() {
+    return 
Hello
; +}'''), + + ("html", '''
+

Hello World

+

This is a paragraph.

+
'''), + + ("html", ''' + +Test +

content

+'''), + + ("css", '''.button { + color: white; + background-color: blue; + padding: 10px 20px; + border-radius: 4px; +}'''), + + ("css", '''@media (max-width: 768px) { + .container { + width: 100%; + margin: 0; + } +}'''), + + ("sql", '''SELECT u.name, o.total +FROM users u +JOIN orders o ON u.id = o.user_id +WHERE o.created_at > '2024-01-01' +ORDER BY o.total DESC +LIMIT 10;'''), + + ("sql", '''CREATE TABLE products ( + id INTEGER PRIMARY KEY, + name VARCHAR(255) NOT NULL, + price DECIMAL(10, 2) +);'''), + + ("bash", '''#!/bin/bash +echo "Starting backup..." +tar -czf backup.tar.gz /data +echo "Done!"'''), + + ("bash", '''for file in *.txt; do + if [ -f "$file" ]; then + echo "Processing $file" + fi +done'''), + + ("java", '''public class HelloWorld { + public static void main(String[] args) { + System.out.println("Hello, World!"); + } +}'''), + + ("java", '''private String formatName(String first, String last) { + return last + ", " + first; +}'''), + + ("go", '''package main + +import "fmt" + +func main() { + name := "world" + fmt.Printf("Hello, %s!\\n", name) +}'''), + + ("go", '''type Server struct { + host string + port int +} + +func (s *Server) Start() error { + return nil +}'''), + + ("rust", '''fn factorial(n: u64) -> u64 { + match n { + 0 | 1 => 1, + _ => n * factorial(n - 1), + } +} + +fn main() { + println!("{}", factorial(5)); +}'''), + + ("ruby", '''def greet(name) + puts "Hello, #{name}!" +end + +class Person + attr_accessor :name +end'''), + + ("json", '''{ + "name": "John", + "age": 30, + "city": "New York", + "skills": ["Python", "JavaScript"] +}'''), + + ("yaml", '''--- +name: MyApp +version: "1.0" +dependencies: + - python>=3.8 + - requests +config: + debug: true + port: 8080'''), + + ("cpp", '''#include +#include + +int main(int argc, char *argv[]) { + printf("Hello, World!\\n"); + return 0; +}'''), + + ("php", '''A Python code block without a language tag:

+ +
+
def hello():
+    print("Hello")
+
+
+ +

A JavaScript code block without a language tag:

+ +
+
const x = 42;
+console.log(x);
+
+
+ +

An explicitly tagged code block should be left as-is:

+ +
+
echo hello
+
+
diff --git a/test/tm-cases/auto_language.opts b/test/tm-cases/auto_language.opts new file mode 100644 index 00000000..00937a1d --- /dev/null +++ b/test/tm-cases/auto_language.opts @@ -0,0 +1 @@ +{"extras": ["fenced-code-blocks", "auto-language"]} \ No newline at end of file diff --git a/test/tm-cases/auto_language.tags b/test/tm-cases/auto_language.tags new file mode 100644 index 00000000..2f27d86f --- /dev/null +++ b/test/tm-cases/auto_language.tags @@ -0,0 +1 @@ +extra fenced-code-blocks auto-language pygments \ No newline at end of file diff --git a/test/tm-cases/auto_language.text b/test/tm-cases/auto_language.text new file mode 100644 index 00000000..40d9087c --- /dev/null +++ b/test/tm-cases/auto_language.text @@ -0,0 +1,19 @@ +A Python code block without a language tag: + +``` +def hello(): + print("Hello") +``` + +A JavaScript code block without a language tag: + +``` +const x = 42; +console.log(x); +``` + +An explicitly tagged code block should be left as-is: + +```bash +echo hello +```