Commit efcef14

Saiprashanth Pulisetti committed
feat(cyber_security): add file_integrity_monitor and url_analyzer; add idna dep
1 parent c668557 commit efcef14

3 files changed: 259 additions & 0 deletions

File tree:
- cyber_security/file_integrity_monitor.py
- cyber_security/url_analyzer.py
- requirements.txt

cyber_security/file_integrity_monitor.py

126 additions & 0 deletions

#!/usr/bin/env python3
"""File Integrity Monitor (FIM).

Two modes:
- init: hash files and create a baseline JSON
- verify: re-hash and report modifications/additions/deletions

Usage:
    python -m cyber_security.file_integrity_monitor init --root /path --output baseline.json --glob "**/*.py"
    python -m cyber_security.file_integrity_monitor verify --root /path --baseline baseline.json --glob "**/*.py"
"""
from __future__ import annotations

import argparse
import fnmatch
import hashlib
import json
import os
import sys
from dataclasses import dataclass
from typing import Dict, Iterable


@dataclass
class FileHash:
    path: str
    sha256: str


def iter_files(root: str, pattern: str | None) -> Iterable[str]:
    for base, _dirs, files in os.walk(root):
        for name in files:
            rel = os.path.relpath(os.path.join(base, name), root)
            if pattern is None or fnmatch.fnmatch(rel, pattern):
                yield rel


def hash_file(root: str, rel_path: str) -> str:
    # Stream the file in 8 KiB chunks so large files do not load into memory.
    h = hashlib.sha256()
    abs_path = os.path.join(root, rel_path)
    with open(abs_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()


def build_baseline(root: str, pattern: str | None) -> Dict[str, str]:
    result: Dict[str, str] = {}
    for rel in iter_files(root, pattern):
        try:
            result[rel] = hash_file(root, rel)
        except (PermissionError, FileNotFoundError):
            # Skip files that are unreadable or disappear mid-scan.
            continue
    return result


def write_json(path: str, data: Dict[str, str]) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, sort_keys=True)


def read_json(path: str) -> Dict[str, str]:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def cmd_init(args: argparse.Namespace) -> int:
    baseline = build_baseline(args.root, args.glob)
    write_json(args.output, baseline)
    print(f"Baseline written: {args.output} ({len(baseline)} files)")
    return 0


def cmd_verify(args: argparse.Namespace) -> int:
    prior = read_json(args.baseline)
    # Re-scan with the same glob the baseline was built with; otherwise
    # files outside the pattern are falsely reported as additions.
    current = build_baseline(args.root, args.glob)

    added = sorted(set(current) - set(prior))
    removed = sorted(set(prior) - set(current))
    modified = sorted(p for p in set(current) & set(prior) if current[p] != prior[p])

    if added:
        print("Added:")
        for p in added:
            print(f"  + {p}")
    if removed:
        print("Removed:")
        for p in removed:
            print(f"  - {p}")
    if modified:
        print("Modified:")
        for p in modified:
            print(f"  * {p}")

    if not (added or removed or modified):
        print("No changes detected.")
        return 0
    return 1


def parse_args(argv: list[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="File Integrity Monitor (FIM)")
    sub = parser.add_subparsers(dest="command", required=True)

    p_init = sub.add_parser("init", help="Create baseline JSON of file hashes")
    p_init.add_argument("--root", required=True, help="Root directory to scan")
    p_init.add_argument("--output", required=True, help="Path to baseline JSON output")
    p_init.add_argument("--glob", help="Glob-like pattern relative to root (e.g., **/*.py)")
    p_init.set_defaults(func=cmd_init)

    p_ver = sub.add_parser("verify", help="Verify current state against baseline JSON")
    p_ver.add_argument("--root", required=True, help="Root directory to scan")
    p_ver.add_argument("--baseline", required=True, help="Path to baseline JSON")
    p_ver.add_argument("--glob", help="Glob-like pattern used when the baseline was created")
    p_ver.set_defaults(func=cmd_verify)

    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    args = parse_args(sys.argv[1:] if argv is None else argv)
    return int(args.func(args))


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())

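For a quick smoke test of both modes, here is a minimal sketch (not part of the commit), assuming the repository root is on sys.path so the new module imports as cyber_security.file_integrity_monitor:

import os
import tempfile

from cyber_security.file_integrity_monitor import build_baseline, read_json, write_json

# Work in a scratch directory so nothing real is touched.
root = tempfile.mkdtemp()
with open(os.path.join(root, "app.py"), "w", encoding="utf-8") as f:
    f.write("print('v1')\n")

# init: hash matching files and persist the baseline.
write_json(os.path.join(root, "baseline.json"), build_baseline(root, "*.py"))

# Tamper with the file, then verify against the baseline.
with open(os.path.join(root, "app.py"), "w", encoding="utf-8") as f:
    f.write("print('v2')\n")

prior = read_json(os.path.join(root, "baseline.json"))
current = build_baseline(root, "*.py")
print([p for p in prior if p in current and prior[p] != current[p]])  # ['app.py']
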
cyber_security/url_analyzer.py

132 additions & 0 deletions

#!/usr/bin/env python3
"""URL Analyzer: heuristic checks for phishing and IDN homograph risks.

Usage:
    python -m cyber_security.url_analyzer https://example.com
"""
from __future__ import annotations

import argparse
import re
import sys
from urllib.parse import urlparse

import idna  # type: ignore


SUSPICIOUS_TLDS = {
    "zip",
    "mov",
    "top",
    "xyz",
}


def is_punycode(label: str) -> bool:
    return label.lower().startswith("xn--")


def has_mixed_scripts(label: str) -> bool:
    # Crude mixed-script detection: Latin plus Cyrillic and/or Greek.
    latin = re.search(r"[A-Za-z]", label)
    cyrillic = re.search(r"[\u0400-\u04FF]", label)
    greek = re.search(r"[\u0370-\u03FF]", label)
    scripts = sum(bool(x) for x in (latin, cyrillic, greek))
    return scripts >= 2


def looks_like_homograph(label: str) -> bool:
    # Very rough: replace common confusables, then check whether the
    # normalized label contains a well-known brand name.
    confusable_map = {
        "0": "o",
        "1": "l",
        "3": "e",
        "5": "s",
        "@": "a",
        "$": "s",
        "!": "i",
        "і": "i",  # Cyrillic i
        "е": "e",  # Cyrillic e
        "о": "o",  # Cyrillic o
    }
    brands = {"google", "apple", "microsoft", "paypal", "facebook", "amazon"}
    norm = label.lower()
    for k, v in confusable_map.items():
        norm = norm.replace(k, v)
    return any(b in norm for b in brands)


def analyze(url: str) -> list[str]:
    issues: list[str] = []
    try:
        parsed = urlparse(url)
    except Exception:
        return ["Invalid URL format."]

    if parsed.scheme not in {"http", "https"}:
        issues.append("Non-HTTP(S) scheme.")

    hostname = parsed.hostname or ""
    if not hostname:
        issues.append("Missing hostname.")
        return issues

    labels = hostname.split(".")
    decoded_labels = []
    for label in labels:
        if is_punycode(label):
            issues.append("Punycode present (possible IDN homograph).")
            try:
                decoded_labels.append(idna.decode(label))
            except Exception:
                decoded_labels.append(label)
        else:
            decoded_labels.append(label)

    for label in decoded_labels:
        if has_mixed_scripts(label):
            issues.append("Mixed Unicode scripts in hostname label.")
        if looks_like_homograph(label):
            issues.append("Label resembles a well-known brand (possible homograph).")

    tld = labels[-1].lower() if labels else ""
    if tld in SUSPICIOUS_TLDS:
        issues.append(f"Suspicious TLD: .{tld}")

    # Path/query heuristics.
    path_query = (parsed.path or "") + ("?" + parsed.query if parsed.query else "")
    if re.search(r"(?i)login|verify|update|secure|account", path_query):
        issues.append("Phishing-related keyword in path/query.")
    if "@" in parsed.netloc:
        issues.append("@ in netloc may hide the true host (userinfo trick).")

    return issues


def parse_args(argv: list[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Analyze a URL for phishing/IDN risks.")
    parser.add_argument("url", help="URL to analyze")
    parser.add_argument("--quiet", action="store_true", help="Exit code only: 0 safe-ish, 1 suspicious")
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    args = parse_args(sys.argv[1:] if argv is None else argv)
    issues = analyze(args.url)

    if args.quiet:
        return 1 if issues else 0

    if issues:
        print("Suspicious indicators:")
        for item in issues:
            print(f"  - {item}")
        return 1

    print("No obvious issues detected.")
    return 0


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())

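A sketch of what the heuristics flag, under the same import assumption as above (the hostnames below are illustrative, not live sites):

from cyber_security.url_analyzer import analyze

for url in (
    "https://example.com/",
    "https://xn--pple-43d.com/login",  # punycode label known to decode to an "apple" lookalike
    "http://paypa1-secure.top/verify?account=1",
):
    print(url, "->", analyze(url) or "no findings")
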
requirements.txt

1 addition & 0 deletions

@@ -17,3 +17,4 @@ sympy
 tweepy
 typing_extensions
 xgboost
+idna

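The new idna dependency backs the punycode decoding in url_analyzer. A minimal round-trip sketch (the first letter of the label below is Cyrillic U+0430, not Latin "a"):

import idna

ascii_label = idna.encode("аpple")  # Unicode label -> ASCII/punycode bytes
print(ascii_label)                  # e.g. b'xn--pple-43d'
print(idna.decode(ascii_label))     # back to the Unicode form: 'аpple'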