From f1fdc5b6607aed0821a364d509659f6a3c89c8cd Mon Sep 17 00:00:00 2001 From: 1700308200 <1700308200@qq.com> Date: Tue, 19 May 2026 00:26:16 +0800 Subject: [PATCH] Add auto-language extra for code block language detection (closes #361) --- CHANGES.md | 2 + README.md | 5 +- lib/markdown2.py | 313 +++++++++++++++++++++++++++ test/test_auto_language_detection.py | 276 +++++++++++++++++++++++ test/tm-cases/auto_language.html | 22 ++ test/tm-cases/auto_language.opts | 1 + test/tm-cases/auto_language.tags | 1 + test/tm-cases/auto_language.text | 19 ++ 8 files changed, 637 insertions(+), 2 deletions(-) create mode 100644 test/test_auto_language_detection.py create mode 100644 test/tm-cases/auto_language.html create mode 100644 test/tm-cases/auto_language.opts create mode 100644 test/tm-cases/auto_language.tags create mode 100644 test/tm-cases/auto_language.text diff --git a/CHANGES.md b/CHANGES.md index 718fb9ed..1077139f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,6 +8,8 @@ - [pull #700] Fix XSS from code spans in image alt text (#699) - [pull #701] Allow boolean attribute syntax in `markdown-in-html` extra - [pull #704] Fix XSS from smuggling spans into image attributes (#702, #703) +- Add ``auto-language`` extra for automatic language detection in fenced + code blocks (issue #361) ## python-markdown2 2.5.5 diff --git a/README.md b/README.md index f1e89c9f..54850014 100644 --- a/README.md +++ b/README.md @@ -92,8 +92,9 @@ as a script: '
boo!
\n' ``` There are a number of currently implemented extras for tables, footnotes, -syntax coloring of ``-blocks, auto-linking patterns, table of contents,
-Smarty Pants (for fancy quotes, dashes, etc.) and more. See the [Extras
+syntax coloring of `<pre>`-blocks, automatic language detection for fenced
+code blocks, auto-linking patterns, table of contents, Smarty Pants (for
+fancy quotes, dashes, etc.) and more. See the [Extras
wiki page](https://github.com/trentm/python-markdown2/wiki/Extras) for full
details.
diff --git a/lib/markdown2.py b/lib/markdown2.py
index dc698970..d942e102 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -4319,12 +4319,325 @@ def test(self, text):
return '||' in text
+class AutoLanguage(Extra):
+    """
+    Automatically detect the programming language of fenced code blocks
+    that don't have an explicit language tag. Uses heuristic pattern
+    matching to identify common languages (Python, JavaScript, HTML,
+    CSS, SQL, Bash, Java, Go, Rust, Ruby, PHP, JSON, YAML, C/C++).
+
+    When combined with ``fenced-code-blocks`` and Pygments, this enables
+    syntax highlighting for code blocks without manual language markers.
+    """
+    name = 'auto-language'
+    order = (FencedCodeBlocks,), ()
+
+    # Matches every fenced block, tagged or not. Tagged blocks have to be
+    # consumed as a unit as well: if only untagged openers were matched,
+    # the closing fence of a tagged block would be mistaken for the
+    # opening fence of an untagged one, pairing fences incorrectly.
+    _fenced_block_re = re.compile(r'''
+        (?:\n+|\A\n?|(?<=\n))
+        (^[ \t]*`{3,})[ \t]*([^`\n]*?)[ \t]*\n  # $1 = opening fence, $2 = info string (may be empty)
+        (.*?)                                  # $3 = code content
+        \1[ \t]*\n                             # closing fence
+        ''', re.M | re.X | re.S)
+
+    def run(self, text: str):
+        """
+        Return ``text`` with a detected language appended to the opening
+        fence of each fenced code block that has no info string.
+
+        Blocks that already carry an info string, and blocks for which
+        ``detect_language`` returns a falsy value, are returned unchanged.
+        """
+        def add_lang(match):
+            if match.group(2):
+                # Explicitly tagged: never override the author's choice.
+                return match.group(0)
+            fence, code = match.group(1), match.group(3)
+            lang = detect_language(code)
+            if not lang:
+                return match.group(0)
+            # Preserve leading context (newlines) so FencedCodeBlocks
+            # can still match the modified block.
+            leading = match.string[match.start():match.start(1)]
+            return f"{leading}{fence}{lang}\n{code}{fence}\n"
+
+        return self._fenced_block_re.sub(add_lang, text)
+
+    def test(self, text: str) -> bool:
+        """Return whether ``text`` contains a run of three backticks."""
+        return '```' in text
+
+
+def detect_language(code: str):
+ """
+ Detect the programming language of a code snippet using heuristic
+ pattern scoring. Returns a Pygments-compatible language name, or
+ None if no language could be determined with sufficient confidence.
+ """
+ if not code or not code.strip():
+ return None
+
+ scores = {}
+
+ # -- Python --
+ s = 0
+ if re.search(r'^\s*(def\s+\w+\s*\(|class\s+\w+.*:)',
+ code, re.M):
+ s += 4
+ if re.search(r'^\s*(import\s+\w+|from\s+\w+\s+import\b)',
+ code, re.M):
+ s += 4
+ if re.search(r'\bself\.', code):
+ s += 3
+ if re.search(
+ r'^\s*(elif\s+|else:|except\s|finally:|try:|with\s|yield\b|raise\s)',
+ code, re.M
+ ):
+ s += 2
+ if re.search(r'\b(print|range|len|enumerate|zip|isinstance|hasattr)\(',
+ code):
+ s += 1
+ if re.search(r'\b(lambda\s|__\w+__)', code):
+ s += 2
+ if re.search(r'\bNone\b', code):
+ s += 1
+ if re.search(r'^\s*@\w+', code, re.M):
+ s += 2
+ if re.search(r'\b(?:__name__|__main__)\b', code):
+ s += 2
+ scores['python'] = s
+
+ # -- JavaScript / TypeScript --
+ s = 0
+ if re.search(r'\b(?:const|let|var)\s+\w+\s*=', code):
+ s += 3
+ if re.search(r'\bfunction\s+\w+\s*\(', code):
+ s += 3
+ if re.search(r'=>\s*\{', code):
+ s += 3
+ if re.search(r'\bconsole\.(?:log|error|warn)\(', code):
+ s += 2
+ if re.search(r'\bdocument\.\w+', code):
+ s += 2
+ if re.search(
+ r'\b(?:require|module\.exports|export\s+(?:default\s+)?)\b',
+ code
+ ):
+ s += 2
+ if re.search(r'\bnew\s+\w+\s*\(', code):
+ s += 1
+ if re.search(r'\bnull\b', code):
+ s += 1
+ # TypeScript type annotations
+ if re.search(r':\s*(?:string|number|boolean|void)\b', code):
+ s += 2
+ scores['javascript'] = s
+
+ # -- HTML --
+ s = 0
+ if re.search(r' x * 2);
+console.log(doubled);'''),
+
+ ("javascript", '''export default function App() {
+ return Hello;
+}'''),
+
+ ("html", '''
+ Hello World
+ This is a paragraph.
+'''),
+
+ ("html", '''
+
+Test
+content
+'''),
+
+ ("css", '''.button {
+ color: white;
+ background-color: blue;
+ padding: 10px 20px;
+ border-radius: 4px;
+}'''),
+
+ ("css", '''@media (max-width: 768px) {
+ .container {
+ width: 100%;
+ margin: 0;
+ }
+}'''),
+
+ ("sql", '''SELECT u.name, o.total
+FROM users u
+JOIN orders o ON u.id = o.user_id
+WHERE o.created_at > '2024-01-01'
+ORDER BY o.total DESC
+LIMIT 10;'''),
+
+ ("sql", '''CREATE TABLE products (
+ id INTEGER PRIMARY KEY,
+ name VARCHAR(255) NOT NULL,
+ price DECIMAL(10, 2)
+);'''),
+
+ ("bash", '''#!/bin/bash
+echo "Starting backup..."
+tar -czf backup.tar.gz /data
+echo "Done!"'''),
+
+ ("bash", '''for file in *.txt; do
+ if [ -f "$file" ]; then
+ echo "Processing $file"
+ fi
+done'''),
+
+ ("java", '''public class HelloWorld {
+ public static void main(String[] args) {
+ System.out.println("Hello, World!");
+ }
+}'''),
+
+ ("java", '''private String formatName(String first, String last) {
+ return last + ", " + first;
+}'''),
+
+ ("go", '''package main
+
+import "fmt"
+
+func main() {
+ name := "world"
+ fmt.Printf("Hello, %s!\\n", name)
+}'''),
+
+ ("go", '''type Server struct {
+ host string
+ port int
+}
+
+func (s *Server) Start() error {
+ return nil
+}'''),
+
+ ("rust", '''fn factorial(n: u64) -> u64 {
+ match n {
+ 0 | 1 => 1,
+ _ => n * factorial(n - 1),
+ }
+}
+
+fn main() {
+ println!("{}", factorial(5));
+}'''),
+
+ ("ruby", '''def greet(name)
+ puts "Hello, #{name}!"
+end
+
+class Person
+ attr_accessor :name
+end'''),
+
+ ("json", '''{
+ "name": "John",
+ "age": 30,
+ "city": "New York",
+ "skills": ["Python", "JavaScript"]
+}'''),
+
+ ("yaml", '''---
+name: MyApp
+version: "1.0"
+dependencies:
+ - python>=3.8
+ - requests
+config:
+ debug: true
+ port: 8080'''),
+
+ ("cpp", '''#include
+#include
+
+int main(int argc, char *argv[]) {
+ printf("Hello, World!\\n");
+ return 0;
+}'''),
+
+ ("php", '''A Python code block without a language tag:
+
+
+def hello():
+ print("Hello")
+
+
+
+A JavaScript code block without a language tag:
+
+
+const x = 42;
+console.log(x);
+
+
+
+An explicitly tagged code block should be left as-is:
+
+
+echo hello
+
+
diff --git a/test/tm-cases/auto_language.opts b/test/tm-cases/auto_language.opts
new file mode 100644
index 00000000..00937a1d
--- /dev/null
+++ b/test/tm-cases/auto_language.opts
@@ -0,0 +1 @@
+{"extras": ["fenced-code-blocks", "auto-language"]}
\ No newline at end of file
diff --git a/test/tm-cases/auto_language.tags b/test/tm-cases/auto_language.tags
new file mode 100644
index 00000000..2f27d86f
--- /dev/null
+++ b/test/tm-cases/auto_language.tags
@@ -0,0 +1 @@
+extra fenced-code-blocks auto-language pygments
\ No newline at end of file
diff --git a/test/tm-cases/auto_language.text b/test/tm-cases/auto_language.text
new file mode 100644
index 00000000..40d9087c
--- /dev/null
+++ b/test/tm-cases/auto_language.text
@@ -0,0 +1,19 @@
+A Python code block without a language tag:
+
+```
+def hello():
+ print("Hello")
+```
+
+A JavaScript code block without a language tag:
+
+```
+const x = 42;
+console.log(x);
+```
+
+An explicitly tagged code block should be left as-is:
+
+```bash
+echo hello
+```