From dbbaecdf28889759a9fdf2c0a0fa846fd0ce1dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Pir=C3=B3g?= Date: Fri, 10 Apr 2026 17:25:44 +0200 Subject: [PATCH 01/16] elixir: custom expression-to-pattern --- languages/elixir/generic/Elixir_to_generic.ml | 52 +++++++++++++++---- libs/ast_generic/AST_generic_helpers.ml | 4 +- .../elixir/taint-pin-pattern.ex | 7 +++ .../elixir/taint-pin-pattern.yaml | 12 +++++ .../elixir/taint-string-concat-pattern.ex | 4 ++ .../elixir/taint-string-concat-pattern.yaml | 17 ++++++ 6 files changed, 84 insertions(+), 12 deletions(-) create mode 100644 tests/tainting_rules/elixir/taint-pin-pattern.ex create mode 100644 tests/tainting_rules/elixir/taint-pin-pattern.yaml create mode 100644 tests/tainting_rules/elixir/taint-string-concat-pattern.ex create mode 100644 tests/tainting_rules/elixir/taint-string-concat-pattern.yaml diff --git a/languages/elixir/generic/Elixir_to_generic.ml b/languages/elixir/generic/Elixir_to_generic.ml index 9be473fe1..076aa033a 100644 --- a/languages/elixir/generic/Elixir_to_generic.ml +++ b/languages/elixir/generic/Elixir_to_generic.ml @@ -118,6 +118,40 @@ let expr_of_expr_or_kwds (x : (G.expr, keywords_generic) Either_.t) : G.expr = | Left e -> e | Right kwds -> list_container_of_kwds kwds +(* This is a modified version of Ast_generic_helpers.expr_to_pattern *) +let rec expr_to_pattern (e : G.expr) : G.pattern = + match e.e with + | G.N (G.Id (id, info)) -> G.PatId (id, info) + | G.Container (G.Tuple, (t1, xs, t2)) -> + G.PatTuple (t1, List_.map expr_to_pattern xs, t2) + | G.L l -> G.PatLiteral l + | G.Container ((List | Dict), (t1, xs, t2)) -> + G.PatList (t1, List_.map expr_to_pattern xs, t2) + | G.Constructor (n, (_, args, _)) -> + G.PatConstructor (n, List_.map expr_to_pattern args) + | G.Ellipsis t -> G.PatEllipsis t + | G.OtherExpr (tag, [ G.E e ]) -> G.OtherPat (tag, [ G.P (expr_to_pattern e) ]) + | G.Cast (ty, _tok, expr) -> G.PatTyped (expr_to_pattern expr, ty) + | G.LetPattern (p, {e = G.N (G.Id (i, info)); _} ) -> G.PatAs (p, (i, info)) + | G.Call (f, args) -> + begin match f.e, Tok.unbracket args with + | G.N (G.Id (("<>", _), _) as n), + [ G.Arg ({ e = G.L (G.String _); _ } as l); + G.Arg ({ e = G.N _; _ } as r) ] -> + G.PatConstructor (n, [ expr_to_pattern l; expr_to_pattern r ]) + | G.N (G.Id (("^", _), _)), + [ G.Arg ({ e = G.N _; _ } as rhs) ] -> + let tmp = "__tmp", Tok.unsafe_fake_tok "__tmp" in + let tmp_info = G.empty_id_info ~hidden:true () in + let lhs = G.N (G.Id (tmp, tmp_info)) |> G.e in + let op = G.IdSpecial (G.Op G.Eq, Tok.unsafe_fake_tok "==") |> G.e in + let cmp = G.Call (op, Tok.unsafe_fake_bracket [ G.Arg lhs; G.Arg rhs ]) |> G.e in + G.PatWhen (G.PatId (tmp, tmp_info), cmp) + | _ -> OtherPat (("ExprToPattern", Tok.unsafe_fake_tok ""), [ G.E e ]) + end + (* TODO: PatKeyVal and more *) + | _ -> OtherPat (("ExprToPattern", Tok.unsafe_fake_tok ""), [ G.E e ]) + (* TODO: lots of work here to detect when args is really a single * pattern, or tuples *) let pat_of_args_and_when (args, when_opt) : G.pattern = @@ -129,8 +163,8 @@ let pat_of_args_and_when (args, when_opt) : G.pattern = let pats = List_.map (function - | G.OtherArg (("ArgKwdQuoted", _), [ G.E e ]) -> H.expr_to_pattern e - | arg -> H.argument_to_expr arg |> H.expr_to_pattern) + | G.OtherArg (("ArgKwdQuoted", _), [ G.E e ]) -> expr_to_pattern e + | arg -> H.argument_to_expr arg |> expr_to_pattern) args in let pat = @@ -432,7 +466,7 @@ and map_stmt env (v : stmt) : G.stmt = let comp_clauses = List_.map (fun (clause : for_clause) -> match clause with | ForGenerator (pat, tarrow, collection) -> - let pat = map_expr env pat |> H.expr_to_pattern in + let pat = map_expr env pat |> expr_to_pattern in let collection = map_expr env collection in G.CompFor (tfor, pat, tarrow, collection) | ForFilter e -> @@ -494,12 +528,12 @@ and map_param_to_gparam env (p : parameter) : G.parameter = G.Param (G.param_of_id ?pdefault id)) | OtherParamExpr e -> let e = map_expr env e in - G.ParamPattern (H.expr_to_pattern e) + G.ParamPattern (expr_to_pattern e) | OtherParamPair (kwd, e) -> let kwd = map_keyword env kwd in let e = map_expr env e in let e = keyval_of_pair (Left (kwd, e)) in - G.ParamPattern (H.expr_to_pattern e) + G.ParamPattern (expr_to_pattern e) (* Convert one rescue/catch stab clause to a G.catch arm. * Each stab has a list of exception-type expressions and a handler body. *) @@ -510,9 +544,9 @@ and map_rescue_stab_to_catch env tok (stab : stab_clause) : G.catch = | [] -> G.PatEllipsis tok | [arg] -> let e = map_expr env arg in - H.expr_to_pattern e + expr_to_pattern e | args -> - let pats = List_.map (fun a -> H.expr_to_pattern (map_expr env a)) args in + let pats = List_.map (fun a -> expr_to_pattern (map_expr env a)) args in let pat = List.fold_right (fun p acc -> G.DisjPat (p, acc)) (List.tl pats) (List.hd pats) @@ -708,10 +742,10 @@ and map_vardef env v1 v2 = (* TODO: Elixir also has these patterns: * ^x = 0 meaning x cannot be re-assigned later, and * [x|y] = [0, 1, 2] where x maps to 0, and y maps to the rest - * and H.expr_to_pattern doesn't cover these cases. + * and expr_to_pattern doesn't cover these cases. *) and map_letpattern env v1 v2 = - let e1 = H.expr_to_pattern (map_expr env v1) in + let e1 = expr_to_pattern (map_expr env v1) in let e2 = map_expr env v2 in G.LetPattern (e1, e2) |> G.e diff --git a/libs/ast_generic/AST_generic_helpers.ml b/libs/ast_generic/AST_generic_helpers.ml index ea1ac9ca0..d24f8b09c 100644 --- a/libs/ast_generic/AST_generic_helpers.ml +++ b/libs/ast_generic/AST_generic_helpers.ml @@ -199,9 +199,7 @@ let rec expr_to_pattern e = | Container (Tuple, (t1, xs, t2)) -> PatTuple (t1, xs |> List_.map expr_to_pattern, t2) | L l -> PatLiteral l - | Container (List, (t1, xs, t2)) -> - PatList (t1, xs |> List_.map expr_to_pattern, t2) - | Container (Dict, (t1, xs, t2)) -> + | Container ((List | Dict), (t1, xs, t2)) -> PatList (t1, xs |> List_.map expr_to_pattern, t2) | Constructor (n, (_, args, _)) -> PatConstructor (n, args |> List_.map expr_to_pattern) diff --git a/tests/tainting_rules/elixir/taint-pin-pattern.ex b/tests/tainting_rules/elixir/taint-pin-pattern.ex new file mode 100644 index 000000000..6c491d631 --- /dev/null +++ b/tests/tainting_rules/elixir/taint-pin-pattern.ex @@ -0,0 +1,7 @@ +def foo() do + val x = source() + case foo() do + #ruleid: taint + ^x -> sink(x) + end +end diff --git a/tests/tainting_rules/elixir/taint-pin-pattern.yaml b/tests/tainting_rules/elixir/taint-pin-pattern.yaml new file mode 100644 index 000000000..0757e34dd --- /dev/null +++ b/tests/tainting_rules/elixir/taint-pin-pattern.yaml @@ -0,0 +1,12 @@ +rules: +- id: taint + languages: [elixir] + message: "tainted data reached sink" + severity: ERROR + mode: taint + pattern-sources: + - pattern: | + source() + pattern-sinks: + - pattern: | + sink(...) diff --git a/tests/tainting_rules/elixir/taint-string-concat-pattern.ex b/tests/tainting_rules/elixir/taint-string-concat-pattern.ex new file mode 100644 index 000000000..749e1c572 --- /dev/null +++ b/tests/tainting_rules/elixir/taint-string-concat-pattern.ex @@ -0,0 +1,4 @@ +def foo("a" <> x) do + #ruleid: taint + sink(x) +end diff --git a/tests/tainting_rules/elixir/taint-string-concat-pattern.yaml b/tests/tainting_rules/elixir/taint-string-concat-pattern.yaml new file mode 100644 index 000000000..58eb1c4c6 --- /dev/null +++ b/tests/tainting_rules/elixir/taint-string-concat-pattern.yaml @@ -0,0 +1,17 @@ +rules: +- id: taint + languages: [elixir] + message: "tainted data reached sink" + severity: ERROR + mode: taint + pattern-sources: + - patterns: + - pattern-either: + - pattern-inside: | + def $_(..., $X, ...) do + ... + end + - focus-metavariable: $X + pattern-sinks: + - pattern: | + sink(...) From ec51c9041d2d4b7646476c4cf65ad63289c86ec4 Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Tue, 14 Apr 2026 11:10:07 +0100 Subject: [PATCH 02/16] remove unused function `Call_graph.node_key` --- src/call_graph/Call_graph.ml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/call_graph/Call_graph.ml b/src/call_graph/Call_graph.ml index fc46f1740..09be06bbe 100644 --- a/src/call_graph/Call_graph.ml +++ b/src/call_graph/Call_graph.ml @@ -63,11 +63,6 @@ module Dot = Graph.Graphviz.Dot (Display) module Topo = Graph.Topological.Make (G) module SCC = Graph.Components.Make (G) -let node_key (n : node) = - let name = Function_id.show n in - let filename, line, col = Function_id.to_file_line_col n in - Printf.sprintf "%s|%s|%d|%d" name filename line col - (** Helpers **) let pos_of_tok (tok : Tok.t) : Pos.t = From b26d0fc7fd0bdeb75ccaf52e57cb7eceb8b286d8 Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Tue, 14 Apr 2026 11:27:13 +0100 Subject: [PATCH 03/16] clojure: support string-key map destructuring Accept `{sym "str-key"}` forms (and mixed with keywords) in function parameters and `let` bindings. The parser previously only handled keyword values and raised "Invalid map binding form" for string keys. Also extend the Clojure-specific PatKeyVal case in AST_to_IL so the string-literal value pattern registers the PatId binding, letting taint propagate through the destructured variable. --- .../tree-sitter/Parse_clojure_tree_sitter.ml | 23 ++++++++++++------- src/analyzing/AST_to_IL.ml | 5 ++++ .../clojure/map_destructuring_string_keys.clj | 11 +++++++++ .../clojure/taint-propagation.clj | 14 +++++++++++ 4 files changed, 45 insertions(+), 8 deletions(-) create mode 100644 tests/parsing/clojure/map_destructuring_string_keys.clj diff --git a/languages/lisp/tree-sitter/Parse_clojure_tree_sitter.ml b/languages/lisp/tree-sitter/Parse_clojure_tree_sitter.ml index 908d874c6..9f8bc7bf7 100644 --- a/languages/lisp/tree-sitter/Parse_clojure_tree_sitter.ml +++ b/languages/lisp/tree-sitter/Parse_clojure_tree_sitter.ml @@ -1111,17 +1111,24 @@ and map_binding_form_map_lit (env : env) ((_meta, (lb, srcs, rb)) : CST.map_lit) in with_or_as s (token env tk) pats rest - (* Standard map binding, eg, {x :a, [y z] :b}. *) - | _bind_form :: `Kwd_lit _ :: _ -> - let rec keyval_and_rest acc = function - | bind_form :: `Kwd_lit kwd_lit :: rest_forms -> - let key = map_binding_form env bind_form in - (* TODO: PatRecord of (dotted_ident * pattern) list bracket *) + (* Standard map binding, eg, {x :a, [y z] :b, z "str-key"}. *) + | _bind_form :: (`Kwd_lit _ | `Str_lit _) :: _ -> + let map_value_key_pattern = function + | `Kwd_lit kwd_lit -> let atom_kind, tok_colon, atom_name = map_kwd_expr_aux env kwd_lit in - let value = G.OtherPat ((atom_kind, tok_colon), [G.Name atom_name]) in - (* let value = G.PatLiteral (map_kwd_lit env kwd_lit) in *) + G.OtherPat ((atom_kind, tok_colon), [G.Name atom_name]) + | `Str_lit str_tok -> + let s, t = H.str env str_tok in + let s_no_quotes = String.sub s 1 (String.length s - 2) in + G.PatLiteral (G.String (Tok.unsafe_fake_bracket (s_no_quotes, t))) + in + let rec keyval_and_rest acc = function + | bind_form :: (`Kwd_lit _ | `Str_lit _ as kv) :: rest_forms -> + let key = map_binding_form env bind_form in + (* TODO: PatRecord of (dotted_ident * pattern) list bracket *) + let value = map_value_key_pattern kv in keyval_and_rest (G.PatKeyVal (key, value) :: acc) rest_forms diff --git a/src/analyzing/AST_to_IL.ml b/src/analyzing/AST_to_IL.ml index 31762f98e..95dcf942a 100644 --- a/src/analyzing/AST_to_IL.ml +++ b/src/analyzing/AST_to_IL.ml @@ -411,6 +411,11 @@ and pattern env pat : stmts * lval * stmts = [G.Name _atom_name])) when env.lang =*= Lang.Clojure -> pattern env key_pat + (* Clojure string-key destructuring, e.g. `(let [{x "a"} o] x)`. The value + * is a string literal used as the map lookup key; only `key_pat` binds. *) + | G.PatKeyVal (key_pat, G.PatLiteral (G.String _)) + when env.lang =*= Lang.Clojure -> + pattern env key_pat (* Only seems to be used in Ruby, modulo the above case for Clojure. *) | G.PatKeyVal (_key_pat, val_pat) when env.lang =*= Lang.Ruby -> (* My understanding is that the new variables are introduced on the rhs. *) diff --git a/tests/parsing/clojure/map_destructuring_string_keys.clj b/tests/parsing/clojure/map_destructuring_string_keys.clj new file mode 100644 index 000000000..03900f43e --- /dev/null +++ b/tests/parsing/clojure/map_destructuring_string_keys.clj @@ -0,0 +1,11 @@ +;; Parsing test: map destructuring with string keys. + +(defn f [{x "a"}] x) + +(defn g [{x "a" y "b"}] [x y]) + +(defn h [{x :kw y "str"}] [x y]) + +(let [{x "a"} {"a" 1}] x) + +(defn i [{x "a" :as opts :or {x 0}}] [x opts]) diff --git a/tests/tainting_rules/clojure/taint-propagation.clj b/tests/tainting_rules/clojure/taint-propagation.clj index 72b07b076..6c84449ac 100644 --- a/tests/tainting_rules/clojure/taint-propagation.clj +++ b/tests/tainting_rules/clojure/taint-propagation.clj @@ -101,6 +101,20 @@ ;; ruleid: taint-call (sink y1)) +;; map destructuring with string keys +(defn f [{x "a"}] + ;; ruleid: taint-call + (sink x)) + +(defn f [{x "a" y "b"}] + ;; ruleid: taint-call + (sink y)) + +;; mixed keyword and string keys +(defn f [{x :kw y "str"}] + ;; ruleid: taint-call + (sink y)) + (defn f [{:syms [::x y] :as opts}] (if opts ;; ruleid: taint-call From 8c9b2c95e94fc02c2ac6a8018692b98d46c8f8ee Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Tue, 14 Apr 2026 14:31:33 +0100 Subject: [PATCH 04/16] elixir: distinguish field access from zero-arity remote call `foo.bar` (no parens, no args, no do-block) is map/struct field access in Elixir -- semantically distinct from `foo.bar(...)` which is a remote function call. The parser was collapsing both into `Call(DotAccess, [])`, causing `taint_assume_safe_functions: true` to silently drop taint at every field access on a tainted receiver. The tree-sitter grammar already separates the two via distinct non-terminals, so route each CST case to a distinct AST variant: - Add `FieldAccess of remote_dot` alongside `DotRemote of remote_dot` in AST_elixir. - In the parser, emit `FieldAccess rdot` for the no-parens/no-args/ no-do-block case; keep `Call (mk_call_no_parens (Right rdot) ...)` otherwise. - In Elixir_to_generic, translate `FieldAccess` to `G.DotAccess`, the same shape used by other languages for field access. Explicit `foo.bar()` still becomes `Call(DotAccess, [])` and remains subject to `taint_assume_safe_functions`, as intended. --- languages/elixir/ast/AST_elixir.ml | 4 ++ languages/elixir/generic/Elixir_to_generic.ml | 4 ++ .../tree-sitter/Parse_elixir_tree_sitter.ml | 32 +++++++------- .../elixir/taint-field-access.ex | 42 +++++++++++++++++++ .../elixir/taint-field-access.yaml | 17 ++++++++ 5 files changed, 85 insertions(+), 14 deletions(-) create mode 100644 tests/tainting_rules/elixir/taint-field-access.ex create mode 100644 tests/tainting_rules/elixir/taint-field-access.yaml diff --git a/languages/elixir/ast/AST_elixir.ml b/languages/elixir/ast/AST_elixir.ml index 951e421ad..44db3173b 100644 --- a/languages/elixir/ast/AST_elixir.ml +++ b/languages/elixir/ast/AST_elixir.ml @@ -209,6 +209,10 @@ and expr = | DotAnon of expr * tok (* only inside Call *) | DotRemote of remote_dot + (* Elixir map/struct field access: `foo.bar` with no parens, no args, + * no do-block. Distinct from `foo.bar(...)` which is a remote call + * (encoded as `Call (DotRemote _, _, _)`). *) + | FieldAccess of remote_dot | ModuleVarAccess of tok (* @ *) * expr | ArrayAccess of expr * expr bracket (* a Call can be a thousand things, including function and module definitions diff --git a/languages/elixir/generic/Elixir_to_generic.ml b/languages/elixir/generic/Elixir_to_generic.ml index 076aa033a..a8102d64f 100644 --- a/languages/elixir/generic/Elixir_to_generic.ml +++ b/languages/elixir/generic/Elixir_to_generic.ml @@ -878,6 +878,10 @@ and map_expr env v : G.expr = G.OtherExpr (("DotAnon", tdot), [ G.E e ]) |> G.e (* only inside a Call *) | DotRemote v -> map_remote_dot env v + (* Elixir field access: `foo.bar` (no parens). Translate to a plain + * DotAccess so downstream analysis treats it as field access, not a + * zero-arity function call. *) + | FieldAccess v -> map_remote_dot env v | ModuleVarAccess (tat, v2) -> let e = map_expr env v2 in G.OtherExpr (("AttrExpr", tat), [ G.E e ]) |> G.e diff --git a/languages/elixir/tree-sitter/Parse_elixir_tree_sitter.ml b/languages/elixir/tree-sitter/Parse_elixir_tree_sitter.ml index 80ae8926f..2731d66a9 100644 --- a/languages/elixir/tree-sitter/Parse_elixir_tree_sitter.ml +++ b/languages/elixir/tree-sitter/Parse_elixir_tree_sitter.ml @@ -546,10 +546,10 @@ and map_body (env : env) ((v1, v2, v3, v4) : CST.body) : body = let _v4 = map_terminator_opt env v4 in v2 :: v3 -and map_call (env : env) (x : CST.call) : call = +and map_call (env : env) (x : CST.call) : expr = match x with | `Call_with_parens_b98484c x -> map_call_without_parentheses env x - | `Call_with_parens_403315d x -> map_call_with_parentheses env x + | `Call_with_parens_403315d x -> Call (map_call_with_parentheses env x) and map_call_arguments_with_parentheses (env : env) ((v1, v2, v3) : CST.call_arguments_with_parentheses) : arguments bracket = @@ -627,26 +627,32 @@ and map_call_with_parentheses (env : env) (x : CST.call_with_parentheses) : call mk_call_parens (Call call1) args blopt and map_call_without_parentheses (env : env) (x : CST.call_without_parentheses) - : call = + : expr = match x with | `Local_call_with_parens (v1, v2, v3) -> let id = map_identifier env v1 in let args = map_call_arguments_without_parentheses env v2 in let blopt = map_anon_opt_opt_nl_before_do_do_blk_3eff85f env v3 in - mk_call_no_parens (Left id) args blopt + Call (mk_call_no_parens (Left id) args blopt) | `Local_call_just_do_blk (v1, v2) -> let id = map_identifier env v1 in let bl = map_do_block env v2 in - mk_call_no_parens (Left id) ([], []) (Some bl) + Call (mk_call_no_parens (Left id) ([], []) (Some bl)) | `Remote_call_with_parens (v1, v2, v3) -> let rdot = map_remote_dot env v1 in - let args : arguments = - match v2 with - | Some x -> map_call_arguments_without_parentheses env x - | None -> ([], []) - in let blopt = map_anon_opt_opt_nl_before_do_do_blk_3eff85f env v3 in - mk_call_no_parens (Right rdot) args blopt + (match v2, blopt with + | None, None -> + (* Elixir map/struct field access: `foo.bar` with no parens, + * no args, no do-block. Not a function call. *) + FieldAccess rdot + | _ -> + let args : arguments = + match v2 with + | Some x -> map_call_arguments_without_parentheses env x + | None -> ([], []) + in + Call (mk_call_no_parens (Right rdot) args blopt)) and map_capture_expression (env : env) (x : CST.capture_expression) = match x with @@ -884,9 +890,7 @@ and map_expression (env : env) (x : CST.expression) : expr = | `Un_op x -> map_unary_operator env x | `Bin_op x -> map_binary_operator env x | `Dot x -> map_dot env x - | `Call x -> - let c = map_call env x in - Call c + | `Call x -> map_call env x (* semantic: transformed in Access.get/2 *) | `Access_call (v1, v2, v3, v4) -> let v1 = map_expression env v1 in diff --git a/tests/tainting_rules/elixir/taint-field-access.ex b/tests/tainting_rules/elixir/taint-field-access.ex new file mode 100644 index 000000000..37b174775 --- /dev/null +++ b/tests/tainting_rules/elixir/taint-field-access.ex @@ -0,0 +1,42 @@ +defmodule TaintFieldAccess do + # Field access `x.y` (no parens, no args, no do-block) must propagate + # taint even when `taint_assume_safe_functions: true`, because it is + # map/struct field access, not a zero-arity function call. + def field(upload) do + # ruleid: taint-field-access + sink(upload.path) + end + + # Chained field access. + def chained(conn) do + # ruleid: taint-field-access + sink(conn.assigns.current_user) + end + + # Zero-arity remote call `x.y()` IS a function call; under + # `taint_assume_safe_functions: true` the taint is dropped. + def zero_arity_call(upload) do + # ok: taint-field-access + sink(upload.path()) + end + + # Remote call with args — also a call, taint dropped. + def call_with_args(upload) do + # ok: taint-field-access + sink(upload.compute(1, 2)) + end + + # Remote call without parens but with args — still a call. + def call_no_parens_args(upload) do + # ok: taint-field-access + sink(upload.compute 1, 2) + end + + # Remote call with do-block — a call. + def call_do_block(upload) do + # ok: taint-field-access + sink(upload.with_block do + :ok + end) + end +end diff --git a/tests/tainting_rules/elixir/taint-field-access.yaml b/tests/tainting_rules/elixir/taint-field-access.yaml new file mode 100644 index 000000000..ac5375bea --- /dev/null +++ b/tests/tainting_rules/elixir/taint-field-access.yaml @@ -0,0 +1,17 @@ +rules: +- id: taint-field-access + mode: taint + languages: [elixir] + message: "tainted field access reaches sink" + severity: INFO + options: + taint_assume_safe_functions: true + pattern-sources: + - patterns: + - pattern-inside: | + def $_(..., $P, ...) do + ... + end + - focus-metavariable: $P + pattern-sinks: + - pattern: sink(...) From 8241b0b83bb7df8ffae8db839392a89e60661b3d Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Tue, 14 Apr 2026 15:50:01 +0100 Subject: [PATCH 05/16] elixir: short-lambda for remote captures and HOF dot-callee Extend the `&fun/arity` ShortLambda match in the parser to include `FieldAccess _` so `&Mod.fun/arity` is recognised as a remote capture. In `Graph_from_AST.extract_callback_from_arg`, widen the inner-call callee pattern to accept `DotAccess(_, _, FN(Id))` alongside `N(Id _)`. `identify_callback` resolves the method name via `all_funcs`. Extend `test_hof_comprehensive_elixir.ex` with `&Mod.fun/1` cases via `Enum.map` and the custom HOF, plus a second `defmodule` for cross-module resolution. Co-authored-by: @corneliuhoffman --- .../tree-sitter/Parse_elixir_tree_sitter.ml | 2 +- src/tainting/Graph_from_AST.ml | 9 +++++--- .../test_hof_comprehensive_elixir.ex | 21 +++++++++++++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/languages/elixir/tree-sitter/Parse_elixir_tree_sitter.ml b/languages/elixir/tree-sitter/Parse_elixir_tree_sitter.ml index 2731d66a9..beba22f72 100644 --- a/languages/elixir/tree-sitter/Parse_elixir_tree_sitter.ml +++ b/languages/elixir/tree-sitter/Parse_elixir_tree_sitter.ml @@ -674,7 +674,7 @@ and map_capture_expression (env : env) (x : CST.capture_expression) = | other -> other in (match actual_fun_name with - | I _ | Alias _ | DotAlias _ | DotRemote _ -> + | I _ | Alias _ | DotAlias _ | DotRemote _ | FieldAccess _ -> (* Convert &fun/arity to &(fun(&1, &2, ...)) *) let arity_int = Int64.to_int arity in (* Create PlaceHolder arguments: &1, &2, ... *) diff --git a/src/tainting/Graph_from_AST.ml b/src/tainting/Graph_from_AST.ml index 9e7f80ae8..06b5d89e5 100644 --- a/src/tainting/Graph_from_AST.ml +++ b/src/tainting/Graph_from_AST.ml @@ -546,13 +546,16 @@ let extract_callback_from_arg (arg_expr : G.expr) : (IL.name * Tok.t * IL.name o | G.DotAccess (_, _, G.FN (G.Id (id, id_info))) -> let callback_name = AST_to_IL.var_of_id_info id id_info in Some (callback_name, snd id, None) - (* Elixir: &func/n - ShortLambda wrapping a call to the named function. - Structure: OtherExpr("ShortLambda", [Params[&1,...]; S(ExprStmt(Call(func, args)))]) + (* Elixir: &func/n or &Mod.func/n - ShortLambda wrapping a call to the + named (local or remote) function. Structure: + OtherExpr("ShortLambda", [Params[&1,...]; S(ExprStmt(Call(func, args)))]) + where func is either a plain Id or a DotAccess(..., FN(Id)). Create a _tmp node to match what AST_to_IL creates for the anonymous wrapper. *) | G.OtherExpr (("ShortLambda", shortlambda_tok), [G.Params _; G.S { G.s = G.ExprStmt (inner_e, _); _ }]) -> (match inner_e.G.e with - | G.Call ({ e = G.N (G.Id (id, id_info)); _ }, _) -> + | G.Call ({ e = G.N (G.Id (id, id_info)) + | G.DotAccess (_, _, G.FN (G.Id (id, id_info))); _ }, _) -> let callback_name = AST_to_IL.var_of_id_info id id_info in (* Create _tmp IL.name using Tok.fake_tok like AST_to_IL.fresh_var does *) let tmp_tok = Tok.fake_tok shortlambda_tok "_tmp" in diff --git a/tests/rules/cross_function_tainting/test_hof_comprehensive_elixir.ex b/tests/rules/cross_function_tainting/test_hof_comprehensive_elixir.ex index 40ce3f911..bde811e49 100644 --- a/tests/rules/cross_function_tainting/test_hof_comprehensive_elixir.ex +++ b/tests/rules/cross_function_tainting/test_hof_comprehensive_elixir.ex @@ -248,6 +248,27 @@ defmodule TestHOF do # Top-level user-defined HOF custom_for_each(toplevel_items, &toplevel_handler/1) end + + # Remote-capture short lambdas `&Mod.fun/arity`: the left of `/` is a + # dot expression, exercising the ShortLambda conversion path for + # remote dots (FieldAccess) -- local captures `&fn/arity` do not. + def test_remote_capture_builtin() do + arr = [source()] + mapped = Enum.map(arr, &RemoteHelper.process_remote/1) + end + + def test_remote_capture_custom() do + arr = [source()] + mapped = custom_map_builtin(arr, &RemoteHelper.process_remote/1) + end +end + +defmodule RemoteHelper do + def process_remote(x) do + # ruleid: test-hof-taint + sink(x) + x + end end def toplevel_handler(x) do From ff557139dad9a6b25b9f864665625300ab9737bd Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Wed, 15 Apr 2026 09:51:04 +0100 Subject: [PATCH 06/16] nuitka: bundle charset_normalizer Commit a0b26ed13 dropped chardet without replacing it; Nuitka does not follow the try/except import in requests/__init__.py, so Windows builds since 1.17.0 emit a RequestsDependencyWarning on every run (#656). --- scripts/build-nuitka.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/build-nuitka.sh b/scripts/build-nuitka.sh index 2d4a35260..e04f76cde 100755 --- a/scripts/build-nuitka.sh +++ b/scripts/build-nuitka.sh @@ -60,6 +60,7 @@ pushd cli --include-data-dir="$SRC_SEMGREP_DIR/templates=semgrep/templates" \ --include-data-file="$SRC_SEMGREP_DIR/semgrep_interfaces/lang.json=semgrep/semgrep_interfaces/lang.json" \ --include-data-file="$SRC_SEMGREP_DIR/semgrep_interfaces/rule_schema_v1.yaml=semgrep/semgrep_interfaces/rule_schema_v1.yaml" \ + --include-package=charset_normalizer \ --no-deployment-flag=self-execution \ --windows-icon-from-ico=spec/opengrep.ico \ --linux-icon=spec/opengrep.ico \ From 8527a603db2e75f8affe00a10560f592de94f049 Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Wed, 15 Apr 2026 09:51:39 +0100 Subject: [PATCH 07/16] install.ps1: don't treat stderr warnings as fatal Route the --version probe's stderr through a temp file and gate on $LASTEXITCODE only, so runtime warnings surface via Write-Host instead of aborting install.ps1 with a NativeCommandError (#656). --- install.ps1 | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/install.ps1 b/install.ps1 index b168962cd..aebfff54a 100644 --- a/install.ps1 +++ b/install.ps1 @@ -333,9 +333,26 @@ function Main { } Write-Host "Testing binary..." - # Test by calling --version on the downloaded binary - $testOutput = & $binaryPath --version 2>&1 - if (-not $testOutput -or $LASTEXITCODE -ne 0) { + # Test by calling --version on the downloaded binary. + # We route stderr through a temp file rather than using `2>&1`, so + # that harmless runtime warnings (e.g. requests' RequestsDependency- + # Warning) are not surfaced by PowerShell as NativeCommandError + # records and misinterpreted as failures. We still surface them to + # the user via Write-Host, and rely on $LASTEXITCODE to decide + # whether the binary actually ran. + $stderrFile = New-TemporaryFile + try { + $testOutput = & $binaryPath --version 2>$stderrFile + $testExit = $LASTEXITCODE + $testStderr = (Get-Content -Raw -ErrorAction SilentlyContinue $stderrFile) + } + finally { + Remove-Item -Force -ErrorAction SilentlyContinue $stderrFile + } + if ($testStderr) { + Write-Host $testStderr + } + if ($testExit -ne 0 -or -not $testOutput) { throw "Failed to execute installed binary: $binaryPath" } From 23638143631dc4b9f89340346a84e4587dd9053c Mon Sep 17 00:00:00 2001 From: corneliuhoffman Date: Wed, 18 Feb 2026 16:33:40 +0000 Subject: [PATCH 08/16] nested lambdas: added tests and initial solution --- src/analyzing/AST_to_IL.ml | 32 +++++++++- src/analyzing/Visit_function_defs.ml | 1 + src/call_graph/Function_id.ml | 23 ++++++- src/tainting/Dataflow_tainting.ml | 63 +++++++++++++------ src/tainting/Graph_from_AST.ml | 24 ++++--- .../test_hof_callback_taint_ruby.rb | 9 +-- .../test_lambda_deeply_nested.go | 37 +++++++++++ .../test_lambda_deeply_nested.yaml | 11 ++++ .../test_lambda_deeply_nested_java.java | 38 +++++++++++ .../test_lambda_deeply_nested_java.yaml | 11 ++++ .../test_lambda_deeply_nested_js.js | 35 +++++++++++ .../test_lambda_deeply_nested_js.yaml | 11 ++++ .../test_lambda_deeply_nested_php.php | 37 +++++++++++ .../test_lambda_deeply_nested_php.yaml | 11 ++++ .../test_lambda_deeply_nested_py.py | 25 ++++++++ .../test_lambda_deeply_nested_py.yaml | 11 ++++ .../test_lambda_deeply_nested_rust.rs | 35 +++++++++++ .../test_lambda_deeply_nested_rust.yaml | 11 ++++ .../test_lambda_nested_captured.go | 16 +++++ .../test_lambda_nested_captured.yaml | 11 ++++ .../test_lambda_nested_param.go | 16 +++++ .../test_lambda_nested_param.yaml | 11 ++++ .../test_lambda_no_taint.go | 14 +++++ .../test_lambda_no_taint.yaml | 11 ++++ .../test_lambda_param_flow.go | 13 ++++ .../test_lambda_param_flow.yaml | 11 ++++ .../test_lambda_simple_captured.go | 14 +++++ .../test_lambda_simple_captured.yaml | 11 ++++ 28 files changed, 517 insertions(+), 36 deletions(-) create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested.go create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.java create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_js.js create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_js.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_php.php create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_php.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_py.py create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_py.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_rust.rs create mode 100644 tests/rules/cross_function_tainting/test_lambda_deeply_nested_rust.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_nested_captured.go create mode 100644 tests/rules/cross_function_tainting/test_lambda_nested_captured.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_nested_param.go create mode 100644 tests/rules/cross_function_tainting/test_lambda_nested_param.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_no_taint.go create mode 100644 tests/rules/cross_function_tainting/test_lambda_no_taint.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_param_flow.go create mode 100644 tests/rules/cross_function_tainting/test_lambda_param_flow.yaml create mode 100644 tests/rules/cross_function_tainting/test_lambda_simple_captured.go create mode 100644 tests/rules/cross_function_tainting/test_lambda_simple_captured.yaml diff --git a/src/analyzing/AST_to_IL.ml b/src/analyzing/AST_to_IL.ml index 95dcf942a..38d551aa3 100644 --- a/src/analyzing/AST_to_IL.ml +++ b/src/analyzing/AST_to_IL.ml @@ -858,6 +858,34 @@ and expr_aux env ?(void = false) g_expr : stmts * exp = |> G.e) ] in call_generic env ~void tok eorig e (Tok.unsafe_fake_bracket arg_container) + (* Ruby do-block flattening: `f(args) do |x| ... end` is parsed as + Call(Call(f, args), [Lambda]) but the block is semantically an argument + to f, not to its return value. Flatten into Call(f, args @ [Lambda]). *) + | G.Call ({ e = G.Call (callee, inner_args); _ }, outer_args) + when env.lang =*= Lang.Ruby + && List.exists + (function + | G.Arg { G.e = G.Lambda _; _ } -> true + | _ -> false) + (Tok.unbracket outer_args) -> + let merged_args = + Tok.unsafe_fake_bracket + (Tok.unbracket inner_args @ Tok.unbracket outer_args) + in + expr_aux env ~void (G.Call (callee, merged_args) |> G.e) + (* Ruby: when the callee is a plain identifier (G.N), evaluate it via + `lval` instead of `expr` to skip the `ident_function_call_hack` (see + the G.N arm below, ~line 892). That hack wraps bare identifiers in a + 0-arg Call for Ruby (where `foo` can mean `foo()`), but here we already + have an explicit G.Call — going through `expr` would produce a spurious + nested Call(Call(f, []), args) instead of Call(f, args). *) + | G.Call (({ G.e = G.N _; _ } as e), args) when env.lang =*= Lang.Ruby -> + let tok = G.fake "call" in + let ss_callee, callee_lval = lval env e in + let callee_exp = mk_e (Fetch callee_lval) (related_exp e) in + let ss_args, il_args = arguments env (Tok.unbracket args) in + let ss_call, call_exp = call_instr tok eorig ~void (fun res -> Call (res, callee_exp, il_args)) in + (ss_callee @ ss_args @ ss_call, call_exp) | G.Call (e, args) -> let tok = G.fake "call" in call_generic env ~void tok eorig e args @@ -988,7 +1016,7 @@ and expr_aux env ?(void = false) g_expr : stmts * exp = | G.Comprehension (_op, (_l, (er, clauses), _r)) -> comprehension env er clauses | G.Lambda fdef -> - let lval = fresh_lval (snd fdef.fkind) in + let lval = fresh_lval ~str:"_tmp_lambda" (snd fdef.fkind) in let final_fdef = (* NOTE: Reset control-flow labels so that break/continue/recur from * the enclosing scope don't bleed into the lambda body. *) @@ -1408,7 +1436,7 @@ and record env ((_tok, origfields, _) as record_def) : stmts * exp = (* Some languages such as javascript allow function definitions in object literal syntax. *) | G.FuncDef fdef -> - let lval = fresh_lval (snd fdef.fkind) in + let lval = fresh_lval ~str:"_tmp_lambda" (snd fdef.fkind) in (* See NOTE about resetting control-flow labels for lambdas. *) let fdef = function_definition diff --git a/src/analyzing/Visit_function_defs.ml b/src/analyzing/Visit_function_defs.ml index f7e3bcf8f..98917f93e 100644 --- a/src/analyzing/Visit_function_defs.ml +++ b/src/analyzing/Visit_function_defs.ml @@ -251,6 +251,7 @@ class ['self] visitor_with_parent_path = self#visit_stmt f body) | G.DefStmt (ent, G.VarDef { vinit = Some { e = G.Lambda fdef; _ }; _ }) -> + (* Handle lambda assignments in class fields *) let class_il = Option.bind !current_class g_name_to_il_name in let func_il = entity_to_il_name ent in let visitor_parent_path, current_fn_id = diff --git a/src/call_graph/Function_id.ml b/src/call_graph/Function_id.ml index c4b96ba2b..abba14852 100644 --- a/src/call_graph/Function_id.ml +++ b/src/call_graph/Function_id.ml @@ -20,8 +20,21 @@ let normalize_file (file : Fpath.t) : string = Fpath.to_string (Fpath.normalize file) let key ((id, tok) : t) = + (* For lambda names (starting with "_tmp_lambda"), extract position even from + * fake tokens that have position info. This is important for distinguishing + * different lambdas that would otherwise all collide on the same name. + * For regular functions with fake tokens, use empty key + * to preserve the original matching behavior. *) + let is_lambda_name = String.starts_with ~prefix:"_tmp_lambda" id in if Tok.is_fake tok then - (id, "", 0, 0) + if is_lambda_name then + match Tok.loc_of_tok tok with + | Ok loc -> + (id, normalize_file loc.Tok.pos.file, loc.Tok.pos.line, loc.Tok.pos.column) + | _ -> + (id, "", 0, 0) + else + (id, "", 0, 0) else let file = Tok.file_of_tok tok in let line = Tok.line_of_tok tok in @@ -64,6 +77,12 @@ let show_debug (id, tok) : string = let of_il_name (n : IL.name) : t = n.IL.ident +(* Unlike [key], we don't gate on is_lambda_name here: this is only used for + display/serialization, not identity, so extracting position from any fake + token that has it is strictly better than returning "unknown". *) let to_file_line_col ((_, tok) : t) : string * int * int = - if Tok.is_fake tok then ("unknown", 0, 0) + if Tok.is_fake tok then + match Tok.loc_of_tok tok with + | Ok loc -> (normalize_file loc.Tok.pos.file, loc.Tok.pos.line, loc.Tok.pos.column) + | _ -> ("unknown", 0, 0) else (normalize_file (Tok.file_of_tok tok), Tok.line_of_tok tok, Tok.col_of_tok tok) diff --git a/src/tainting/Dataflow_tainting.ml b/src/tainting/Dataflow_tainting.ml index f0302660f..3d112f813 100644 --- a/src/tainting/Dataflow_tainting.ml +++ b/src/tainting/Dataflow_tainting.ml @@ -773,6 +773,17 @@ let lambdas_to_analyze_in_node env lambdas node = in Option.to_list unused_lambda_def @ lambdas_used_in_node lambdas node +(* Collect ALL lambdas recursively from a fun_cfg, in innermost-first order. + This ensures nested lambda signatures are extracted before their parents. *) +let rec collect_all_lambdas_innermost_first (fun_cfg : IL.fun_cfg) + : (IL.name * IL.fun_cfg) list = + IL.NameMap.fold (fun name lcfg results -> + (* First collect nested lambdas from this lambda *) + let nested = collect_all_lambdas_innermost_first lcfg in + (* Then add this lambda after its nested ones *) + results @ nested @ [(name, lcfg)] + ) fun_cfg.lambdas [] + (*****************************************************************************) (* Miscellaneous *) (*****************************************************************************) @@ -1527,7 +1538,27 @@ let check_function_call env fun_exp args (Display_IL.string_of_exp fun_exp) arity env.taint_inst.options.taint_intrafile); let sig_result = - if env.taint_inst.options.taint_intrafile then lookup_signature env fun_exp arity + if env.taint_inst.options.taint_intrafile then + let from_db = lookup_signature env fun_exp arity in + match from_db with + | Some _ -> from_db + | None -> + (* lookup_signature failed - check if callee has a Fun shape in lval_env. + * This handles the case where a lambda is assigned to a variable like: + * callback := func(x) { sink(x) } + * callback(source()) + * The signature is stored under the lambda's internal name (_tmp:N), + * but the variable 'callback' has the Fun shape from the assignment. *) + (match fun_exp.e with + | Fetch lval -> + (match Lval_env.find_lval env.lval_env lval with + | Some (S.Cell (_, S.Fun fun_sig)) -> + Log.debug (fun m -> + m "SIG_FROM_SHAPE: Found Fun shape for %s" + (Display_IL.string_of_exp fun_exp)); + Some fun_sig + | _ -> None) + | _ -> None) else None in match sig_result with @@ -2303,20 +2334,10 @@ let check_tainted_instr env instr : Taints.t * S.shape * Lval_env.t = (* Check if this is a call to a function parameter (either direct or via method) *) (match e_obj with | `Obj (_obj_taints, S.Arg _fun_arg) -> - (* This is a method call on a function parameter (e.g., callback.apply in Java). - * Treat it as invoking the callback. - * EXCEPTION: Ruby's .call method should NOT be treated this way during signature - * extraction, as it creates infinite recursion. Ruby blocks are handled via - * implicit lambda detection instead. *) - let is_ruby_call_method = - match (e.e, env.taint_inst.lang) with - | Fetch { base = _; rev_offset = [{ o = Dot method_name; _ }] }, lang - when Lang.(lang =*= Ruby) && fst method_name.ident = "call" -> true - | _ -> false - in - if not is_ruby_call_method then - effects_of_call_func_arg e (match e_obj with `Obj (_, shape) -> shape | `Fun -> e_shape) args_taints - |> record_effects { env with lval_env } + (* This is a method call on a function parameter (e.g., callback.apply in Java, + * callback.call in Ruby). Treat it as invoking the callback. *) + effects_of_call_func_arg e (match e_obj with `Obj (_, shape) -> shape | `Fun -> e_shape) args_taints + |> record_effects { env with lval_env } | _ -> effects_of_call_func_arg e e_shape args_taints |> record_effects { env with lval_env }); @@ -2879,13 +2900,17 @@ and (fixpoint : | None -> in_env else in_env in - (* Extract signatures for all lambdas in the function for HOF support *) + (* Extract signatures for all lambdas in the function for HOF support. + We collect ALL lambdas (including nested ones) in innermost-first order, + so nested lambda signatures are available when processing their parents. *) let signature_db_with_lambdas = if taint_inst.options.taint_intrafile then match signature_db with | Some db -> - IL.NameMap.fold - (fun lambda_name lambda_cfg acc_db -> + (* Collect all lambdas recursively, innermost first *) + let all_lambdas_list = collect_all_lambdas_innermost_first fun_cfg in + List.fold_left + (fun acc_db (lambda_name, lambda_cfg) -> try Log.debug (fun m -> m "Extracting signature for lambda %s" @@ -2982,7 +3007,7 @@ and (fixpoint : (IL.str_of_name lambda_name) (Printexc.to_string e)); acc_db) - fun_cfg.lambdas db + db all_lambdas_list |> Option.some | None -> signature_db else signature_db diff --git a/src/tainting/Graph_from_AST.ml b/src/tainting/Graph_from_AST.ml index 06b5d89e5..86f62e833 100644 --- a/src/tainting/Graph_from_AST.ml +++ b/src/tainting/Graph_from_AST.ml @@ -121,12 +121,12 @@ let fn_id_of_entity ~(lang : Lang.t) (opt_ent : G.entity option) Some (adjusted_parent_path @ [Some name]) | None -> None) | None -> - (* Anonymous function - use _tmp with fake token to match AST_to_IL behavior. - AST_to_IL.fresh_var creates fake tokens for _tmp variables. *) + (* Anonymous function - use _tmp_lambda with fake token to match AST_to_IL behavior. + AST_to_IL.fresh_var creates fake tokens for lambda variables. *) let tok = match fdef.fkind with (_, tok) -> tok in - let fake_tok = Tok.fake_tok tok "_tmp" in + let fake_tok = Tok.fake_tok tok "_tmp_lambda" in let tmp_name = IL.{ - ident = ("_tmp", fake_tok); + ident = ("_tmp_lambda", fake_tok); sid = G.SId.unsafe_default; id_info = G.empty_id_info (); } in @@ -557,10 +557,10 @@ let extract_callback_from_arg (arg_expr : G.expr) : (IL.name * Tok.t * IL.name o | G.Call ({ e = G.N (G.Id (id, id_info)) | G.DotAccess (_, _, G.FN (G.Id (id, id_info))); _ }, _) -> let callback_name = AST_to_IL.var_of_id_info id id_info in - (* Create _tmp IL.name using Tok.fake_tok like AST_to_IL.fresh_var does *) - let tmp_tok = Tok.fake_tok shortlambda_tok "_tmp" in + (* Create _tmp_lambda IL.name using Tok.fake_tok like AST_to_IL.fresh_var does *) + let tmp_tok = Tok.fake_tok shortlambda_tok "_tmp_lambda" in let tmp_name = IL.{ - ident = ("_tmp", tmp_tok); + ident = ("_tmp_lambda", tmp_tok); sid = G.SId.unsafe_default; id_info = G.empty_id_info (); } in @@ -700,6 +700,16 @@ let extract_hof_callbacks ?(_object_mappings = []) ?(all_funcs = []) inherit [_] G.iter as super method! visit_expr env e = (match e.G.e with + (* Ruby/Scala block pattern: f(args) { block } is Call(Call(callee, inner_args), [block]). + Merge inner_args and block args so the HOF detection sees all arguments together. *) + | G.Call ({ e = G.Call (callee, inner_args); _ }, outer_args) -> + let merged_args = Tok.unsafe_fake_bracket + (Tok.unbracket inner_args @ Tok.unbracket outer_args) in + let found = extract_hof_callbacks_from_call + ~method_hofs ~function_hofs ~all_funcs ~caller_parent_path + callee merged_args + in + callbacks := found @ !callbacks | G.Call (callee, args) -> let found = extract_hof_callbacks_from_call ~method_hofs ~function_hofs ~all_funcs ~caller_parent_path diff --git a/tests/rules/cross_function_tainting/test_hof_callback_taint_ruby.rb b/tests/rules/cross_function_tainting/test_hof_callback_taint_ruby.rb index 6bc87f7f9..158c8c7d3 100644 --- a/tests/rules/cross_function_tainting/test_hof_callback_taint_ruby.rb +++ b/tests/rules/cross_function_tainting/test_hof_callback_taint_ruby.rb @@ -22,8 +22,7 @@ def app_with_direct_flow(f, x) # === Callback-only HOF tests === def test_callback_only_propagating_lambda() - # todoruleid: test-hof-callback-taint - # TODO: Ruby lambda callback not yet working + # ruleid: test-hof-callback-taint sink(app_callback_only(->(x) { x }, source())) end @@ -37,14 +36,12 @@ def test_callback_only_propagating_lambda() # === Direct flow HOF tests (taint always flows via + x) === def test_direct_flow_propagating_lambda() - # todoruleid: test-hof-callback-taint - # TODO: Ruby lambda callback not yet working + # ruleid: test-hof-callback-taint sink(app_with_direct_flow(->(x) { x }, source())) end def test_direct_flow_sanitizing_lambda() - # todoruleid: test-hof-callback-taint - # TODO: Ruby lambda callback not yet working - but taint should flow via + x + # ruleid: test-hof-callback-taint sink(app_with_direct_flow(->(x) { "3" }, source())) end diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested.go b/tests/rules/cross_function_tainting/test_lambda_deeply_nested.go new file mode 100644 index 000000000..ebf8f12fd --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested.go @@ -0,0 +1,37 @@ +package main + +// Test: Deeply nested lambdas (3 levels) +func test4() { + x := source() + level1 := func() { + level2 := func() { + level3 := func() { + // ruleid: test-lambda-deeply-nested + sink(x) + } + level3() + } + level2() + } + level1() +} + +// Test: Deeply nested lambdas split across functions +func test4_level1(x string) { + level2 := func() { + level3 := func() { + // ruleid: test-lambda-deeply-nested + sink(x) + } + level3() + } + level2() +} + +func test4_caller() { + x := source() + test4_level1(x) +} + +func source() string { return "tainted" } +func sink(s string) {} diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested.yaml b/tests/rules/cross_function_tainting/test_lambda_deeply_nested.yaml new file mode 100644 index 000000000..7ab092a0c --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-deeply-nested + message: Tainted data flows to sink through deeply nested lambdas + languages: + - go + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.java b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.java new file mode 100644 index 000000000..da31a621d --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.java @@ -0,0 +1,38 @@ +// Test: Deeply nested lambdas (3 levels) +class TestLambdaDeeplyNested { + + static void test4() { + String x = source(); + Runnable level1 = () -> { + Runnable level2 = () -> { + Runnable level3 = () -> { + // ruleid: test-lambda-deeply-nested-java + sink(x); + }; + level3(); + }; + level2(); + }; + level1(); + } + + // Test: Deeply nested lambdas split across functions + static void test4_level1(String x) { + Runnable level2 = () -> { + Runnable level3 = () -> { + // ruleid: test-lambda-deeply-nested-java + sink(x); + }; + level3(); + }; + level2(); + } + + static void test4_caller() { + String x = source(); + test4_level1(x); + } + + static String source() { return "tainted"; } + static void sink(String x) {} +} diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.yaml b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.yaml new file mode 100644 index 000000000..8a8e8526d --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-deeply-nested-java + message: Tainted data flows to sink through deeply nested lambdas + languages: + - java + severity: WARNING + mode: taint + pattern-sources: + - pattern: source(...) + pattern-sinks: + - pattern: sink(...) diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested_js.js b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_js.js new file mode 100644 index 000000000..df5fe41f8 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_js.js @@ -0,0 +1,35 @@ +// Test: Deeply nested lambdas (3 levels) +function test4() { + let x = source(); + let level1 = () => { + let level2 = () => { + let level3 = () => { + // ruleid: test-lambda-deeply-nested-js + sink(x); + }; + level3(); + }; + level2(); + }; + level1(); +} + +// Test: Deeply nested lambdas split across functions +function test4_level1(x) { + let level2 = () => { + let level3 = () => { + // ruleid: test-lambda-deeply-nested-js + sink(x); + }; + level3(); + }; + level2(); +} + +function test4_caller() { + let x = source(); + test4_level1(x); +} + +function source() { return "tainted"; } +function sink(x) {} diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested_js.yaml b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_js.yaml new file mode 100644 index 000000000..8f30e926c --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_js.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-deeply-nested-js + message: Tainted data flows to sink through deeply nested lambdas + languages: + - javascript + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested_php.php b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_php.php new file mode 100644 index 000000000..3a3fb6b96 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_php.php @@ -0,0 +1,37 @@ + String { String::from("tainted") } +fn sink(_s: &String) {} diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested_rust.yaml b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_rust.yaml new file mode 100644 index 000000000..51e71027a --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_rust.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-deeply-nested-rust + message: Tainted data flows to sink through deeply nested lambdas + languages: + - rust + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) diff --git a/tests/rules/cross_function_tainting/test_lambda_nested_captured.go b/tests/rules/cross_function_tainting/test_lambda_nested_captured.go new file mode 100644 index 000000000..b6de184ca --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_nested_captured.go @@ -0,0 +1,16 @@ +package main + +// Test: Nested lambda capturing parent lambda's parameter +func test2() { + outer := func(a string) { + inner := func() { + // ruleid: test-lambda-nested-captured + sink(a) + } + inner() + } + outer(source()) +} + +func source() string { return "tainted" } +func sink(s string) {} diff --git a/tests/rules/cross_function_tainting/test_lambda_nested_captured.yaml b/tests/rules/cross_function_tainting/test_lambda_nested_captured.yaml new file mode 100644 index 000000000..907f95ee4 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_nested_captured.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-nested-captured + message: Tainted data flows to sink via captured variable from outer lambda + languages: + - go + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) diff --git a/tests/rules/cross_function_tainting/test_lambda_nested_param.go b/tests/rules/cross_function_tainting/test_lambda_nested_param.go new file mode 100644 index 000000000..4f428a8ed --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_nested_param.go @@ -0,0 +1,16 @@ +package main + +// Test: Nested lambda with param at each level +func test5() { + outer := func(a string) { + inner := func(b string) { + // ruleid: test-lambda-nested-param + sink(b) + } + inner(a) + } + outer(source()) +} + +func source() string { return "tainted" } +func sink(s string) {} diff --git a/tests/rules/cross_function_tainting/test_lambda_nested_param.yaml b/tests/rules/cross_function_tainting/test_lambda_nested_param.yaml new file mode 100644 index 000000000..8f88a8d58 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_nested_param.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-nested-param + message: Tainted data flows to sink through nested lambda parameters + languages: + - go + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) diff --git a/tests/rules/cross_function_tainting/test_lambda_no_taint.go b/tests/rules/cross_function_tainting/test_lambda_no_taint.go new file mode 100644 index 000000000..60c642065 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_no_taint.go @@ -0,0 +1,14 @@ +package main + +// Test: No taint - should have NO findings +func test6() { + x := "clean" + callback := func() { + // ok: test-lambda-no-taint + sink(x) + } + callback() +} + +func source() string { return "tainted" } +func sink(s string) {} diff --git a/tests/rules/cross_function_tainting/test_lambda_no_taint.yaml b/tests/rules/cross_function_tainting/test_lambda_no_taint.yaml new file mode 100644 index 000000000..7358c0472 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_no_taint.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-no-taint + message: Tainted data flows to sink + languages: + - go + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) diff --git a/tests/rules/cross_function_tainting/test_lambda_param_flow.go b/tests/rules/cross_function_tainting/test_lambda_param_flow.go new file mode 100644 index 000000000..fed615ee8 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_param_flow.go @@ -0,0 +1,13 @@ +package main + +// Test: Lambda parameter receives taint at call site +func test3() { + callback := func(x string) { + // ruleid: test-lambda-param-flow + sink(x) + } + callback(source()) +} + +func source() string { return "tainted" } +func sink(s string) {} diff --git a/tests/rules/cross_function_tainting/test_lambda_param_flow.yaml b/tests/rules/cross_function_tainting/test_lambda_param_flow.yaml new file mode 100644 index 000000000..7ca1ba90e --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_param_flow.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-param-flow + message: Tainted data flows to sink via lambda parameter + languages: + - go + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) diff --git a/tests/rules/cross_function_tainting/test_lambda_simple_captured.go b/tests/rules/cross_function_tainting/test_lambda_simple_captured.go new file mode 100644 index 000000000..9f0125f11 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_simple_captured.go @@ -0,0 +1,14 @@ +package main + +// Test: Simple lambda with captured variable +func test1() { + x := source() + callback := func() { + // ruleid: test-lambda-simple-captured + sink(x) + } + callback() +} + +func source() string { return "tainted" } +func sink(s string) {} diff --git a/tests/rules/cross_function_tainting/test_lambda_simple_captured.yaml b/tests/rules/cross_function_tainting/test_lambda_simple_captured.yaml new file mode 100644 index 000000000..b24076992 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_simple_captured.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-lambda-simple-captured + message: Tainted data flows to sink via captured variable in lambda + languages: + - go + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) From b3164457c394fc80fd90a38175ef2d4498804fea Mon Sep 17 00:00:00 2001 From: corneliuhoffman Date: Mon, 13 Apr 2026 11:18:50 +0100 Subject: [PATCH 09/16] cleanup of AST_to_IL via the Disambiguate_ruby_calls Dissambiguating the the receiver of a DotAccess solves all the issues in AST_to_al --- src/analyzing/AST_to_IL.ml | 31 +------------------------- src/parsing/Disambiguate_ruby_calls.ml | 10 ++++++++- 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/src/analyzing/AST_to_IL.ml b/src/analyzing/AST_to_IL.ml index 38d551aa3..6b4864b3b 100644 --- a/src/analyzing/AST_to_IL.ml +++ b/src/analyzing/AST_to_IL.ml @@ -873,19 +873,6 @@ and expr_aux env ?(void = false) g_expr : stmts * exp = (Tok.unbracket inner_args @ Tok.unbracket outer_args) in expr_aux env ~void (G.Call (callee, merged_args) |> G.e) - (* Ruby: when the callee is a plain identifier (G.N), evaluate it via - `lval` instead of `expr` to skip the `ident_function_call_hack` (see - the G.N arm below, ~line 892). That hack wraps bare identifiers in a - 0-arg Call for Ruby (where `foo` can mean `foo()`), but here we already - have an explicit G.Call — going through `expr` would produce a spurious - nested Call(Call(f, []), args) instead of Call(f, args). *) - | G.Call (({ G.e = G.N _; _ } as e), args) when env.lang =*= Lang.Ruby -> - let tok = G.fake "call" in - let ss_callee, callee_lval = lval env e in - let callee_exp = mk_e (Fetch callee_lval) (related_exp e) in - let ss_args, il_args = arguments env (Tok.unbracket args) in - let ss_call, call_exp = call_instr tok eorig ~void (fun res -> Call (res, callee_exp, il_args)) in - (ss_callee @ ss_args @ ss_call, call_exp) | G.Call (e, args) -> let tok = G.fake "call" in call_generic env ~void tok eorig e args @@ -902,23 +889,7 @@ and expr_aux env ?(void = false) g_expr : stmts * exp = | G.ArrayAccess (_, _) | G.DeRef (_, _) -> let ss_lv, lval = lval env g_expr in - let exp = mk_e (Fetch lval) eorig in - let ident_function_call_hack ss exp = - (* Taking into account Ruby's ability to allow function calls without - * parameters or parentheses, we are conducting a check to determine - * if a function with the same name as the identifier exists, specifically - * for Ruby. *) - match lval with - | { base = Var { ident; id_info; _ }; _ } - when env.lang =*= Lang.Ruby - && Option.is_none !(id_info.id_resolved) - && IdentSet.mem (H.str_of_ident ident) env.ctx.entity_names -> - let tok = G.fake "call" in - let call_ss, call_exp = call_instr tok eorig ~void (fun res -> Call (res, exp, [])) in - (ss @ call_ss, call_exp) - | _ -> (ss, exp) - in - ident_function_call_hack ss_lv exp + (ss_lv, mk_e (Fetch lval) eorig) (* x = ClassName(args ...) in Python *) (* ClassName has been resolved to __init__ by the pro engine. *) (* Identified and treated as x = New ClassName(args ...) to support diff --git a/src/parsing/Disambiguate_ruby_calls.ml b/src/parsing/Disambiguate_ruby_calls.ml index 8f40031d1..7525f6cff 100644 --- a/src/parsing/Disambiguate_ruby_calls.ml +++ b/src/parsing/Disambiguate_ruby_calls.ml @@ -23,8 +23,16 @@ class ['self] visitor = method! visit_expr_kind env ek = match ek with - (* Do not recurse into the callee of a Call -- only visit arguments. *) + (* Do not recurse into the direct callee of a Call, but DO visit + the receiver of a DotAccess callee — in `helper.process()`, + `helper` may be an unresolved method call that needs wrapping. *) | Call (callee, args) -> + let callee = match callee.e with + | DotAccess (receiver, tok, field) -> + let receiver = self#visit_expr env receiver in + { callee with e = DotAccess (receiver, tok, field) } + | _ -> callee + in let args = self#visit_arguments env args in Call (callee, args) (* Bare unresolved lowercase identifier -- wrap in a zero-arg Call. *) From 489f30be42f93edd96c1d8a5c5682693221d2993 Mon Sep 17 00:00:00 2001 From: corneliuhoffman Date: Mon, 13 Apr 2026 11:37:55 +0100 Subject: [PATCH 10/16] added test to show the Dissambiguation requirement --- .../test_ruby_chained_method.rb | 20 +++++++++++++++++++ .../test_ruby_chained_method.yaml | 13 ++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 tests/rules/cross_function_tainting/test_ruby_chained_method.rb create mode 100644 tests/rules/cross_function_tainting/test_ruby_chained_method.yaml diff --git a/tests/rules/cross_function_tainting/test_ruby_chained_method.rb b/tests/rules/cross_function_tainting/test_ruby_chained_method.rb new file mode 100644 index 000000000..8ca291a22 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_ruby_chained_method.rb @@ -0,0 +1,20 @@ +# Test that taint flows through chained method calls where the receiver +# is itself a method call: get_data.strip should call get_data() first. + +class Controller + def show + # ruleid: test-ruby-chained-method + sink(get_data.strip) + end + + def get_data + source() + end +end + +def source() + "tainted" +end + +def sink(x) +end diff --git a/tests/rules/cross_function_tainting/test_ruby_chained_method.yaml b/tests/rules/cross_function_tainting/test_ruby_chained_method.yaml new file mode 100644 index 000000000..56b1e8436 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_ruby_chained_method.yaml @@ -0,0 +1,13 @@ +rules: + - id: test-ruby-chained-method + message: taint through chained method call + languages: + - ruby + severity: WARNING + mode: taint + options: + taint_intrafile: true + pattern-sources: + - pattern: source(...) + pattern-sinks: + - pattern: sink(...) From 45e2a7fd2fe97006bc2395ac735ea122562e1c7b Mon Sep 17 00:00:00 2001 From: corneliuhoffman Date: Mon, 13 Apr 2026 14:08:39 +0100 Subject: [PATCH 11/16] Removed ctx from AST_to_IL, generalised the disambiguation of called for ruby --- src/analyzing/AST_to_IL.ml | 13 ++--------- src/analyzing/AST_to_IL.mli | 6 ----- src/analyzing/CFG_build.ml | 4 ++-- src/analyzing/CFG_build.mli | 2 +- src/engine/Match_tainting_mode.ml | 26 ++++++---------------- src/engine/Match_tainting_mode.mli | 1 - src/engine/tests/Test_dataflow_tainting.ml | 2 +- src/parsing/Disambiguate_ruby_calls.ml | 18 +++++++-------- src/tainting/Dataflow_tainting.ml | 2 +- src/tainting/Graph_from_AST.ml | 3 ++- 10 files changed, 25 insertions(+), 52 deletions(-) diff --git a/src/analyzing/AST_to_IL.ml b/src/analyzing/AST_to_IL.ml index 6b4864b3b..830588c52 100644 --- a/src/analyzing/AST_to_IL.ml +++ b/src/analyzing/AST_to_IL.ml @@ -45,9 +45,6 @@ let log_error ?tok msg : unit = Log.err (fun m -> m "%s" (locate ?tok msg)) (*****************************************************************************) (* Types *) (*****************************************************************************) -module IdentSet = Set.Make (String) - -type ctx = { entity_names : IdentSet.t } type stmts = stmt list type rec_point_lvals = @@ -63,18 +60,14 @@ type env = { break_labels : label list; cont_label : label option; rec_point_label : label option; - ctx : ctx; rec_point_lvals : rec_point_lvals option; inside_function : bool; } -let empty_ctx : ctx = { entity_names = IdentSet.empty } - let empty_env (lang : Lang.t) : env = { break_labels = []; cont_label = None; rec_point_label = None; - ctx = empty_ctx; rec_point_lvals = None; inside_function = false; lang } @@ -257,8 +250,6 @@ let mk_class_constructor_name (ty : G.type_) cons_id_info : G.name option = Some (G.Id (id, cons_id_info)) | __else__ -> None -let add_entity_name ctx ident : ctx = - { entity_names = IdentSet.add (H.str_of_ident ident) ctx.entity_names } let def_expr_evaluates_to_value (lang : Lang.t) : bool = match lang with @@ -2614,8 +2605,8 @@ and function_definition env fdef : function_definition = (* Entry points *) (****************************************************************************) -let function_definition lang ?ctx fdef : function_definition = - let env = { (empty_env lang) with ctx = ctx ||| empty_ctx } in +let function_definition lang fdef : function_definition = + let env = empty_env lang in function_definition env fdef let stmt lang st : stmts = diff --git a/src/analyzing/AST_to_IL.mli b/src/analyzing/AST_to_IL.mli index 3477658d2..3e6ce6a22 100644 --- a/src/analyzing/AST_to_IL.mli +++ b/src/analyzing/AST_to_IL.mli @@ -1,11 +1,5 @@ -type ctx - -val empty_ctx : ctx -val add_entity_name : ctx -> AST_generic.ident -> ctx - val function_definition : Lang.t -> - ?ctx:ctx -> AST_generic.function_definition -> IL.function_definition diff --git a/src/analyzing/CFG_build.ml b/src/analyzing/CFG_build.ml index 3d61c368d..0cb6282e9 100644 --- a/src/analyzing/CFG_build.ml +++ b/src/analyzing/CFG_build.ml @@ -456,6 +456,6 @@ and cfg_of_fdef fdef = mark_at_exit_nodes cfg; IL.{ params = fdef.fparams; cfg; lambdas } -let cfg_of_gfdef lang ?ctx fdef = - let fdef_il = AST_to_IL.function_definition lang ?ctx fdef in +let cfg_of_gfdef lang fdef = + let fdef_il = AST_to_IL.function_definition lang fdef in cfg_of_fdef fdef_il diff --git a/src/analyzing/CFG_build.mli b/src/analyzing/CFG_build.mli index 4305da9af..4d56b6361 100644 --- a/src/analyzing/CFG_build.mli +++ b/src/analyzing/CFG_build.mli @@ -9,5 +9,5 @@ val cfg_of_fdef : IL.function_definition -> IL.fun_cfg (** Compute the control flow graph of an IL function definition. *) val cfg_of_gfdef : - Lang.t -> ?ctx:AST_to_IL.ctx -> AST_generic.function_definition -> IL.fun_cfg + Lang.t -> AST_generic.function_definition -> IL.fun_cfg (** Same as 'cfg_of_fdef' but takes a Generic function definition. *) diff --git a/src/engine/Match_tainting_mode.ml b/src/engine/Match_tainting_mode.ml index c33104e10..95a2c564a 100644 --- a/src/engine/Match_tainting_mode.ml +++ b/src/engine/Match_tainting_mode.ml @@ -241,9 +241,9 @@ let pms_of_effect ~match_on (effect_ : Effect.t) = (* Main entry points *) (*****************************************************************************) -let check_fundef (taint_inst : Taint_rule_inst.t) (name : IL.name) ctx ?glob_env ?class_name +let check_fundef (taint_inst : Taint_rule_inst.t) (name : IL.name) ?glob_env ?class_name ?signature_db ?builtin_signature_db ?call_graph fdef = - let fdef = AST_to_IL.function_definition taint_inst.lang ~ctx fdef in + let fdef = AST_to_IL.function_definition taint_inst.lang fdef in let fcfg = CFG_build.cfg_of_fdef fdef in let in_env, env_effects = Taint_input_env.mk_fun_input_env taint_inst ?glob_env fdef.fparams @@ -428,16 +428,6 @@ let check_rule per_file_formula_cache (rule : R.taint_rule) match_hook with | None -> (None, None) | Some (taint_inst, spec_matches, expls) -> - (* FIXME: This is no longer needed, now we can just check the type 'n'. *) - let ctx = ref AST_to_IL.empty_ctx in - Visit_function_defs.visit - (fun opt_ent _fdef -> - match opt_ent with - | Some { name = EN (Id (n, _)); _ } -> - ctx := AST_to_IL.add_entity_name !ctx n - | __else__ -> ()) - ast; - let glob_env, glob_effects = Taint_input_env.mk_file_env taint_inst ast in record_matches glob_effects; @@ -483,7 +473,7 @@ let check_rule per_file_formula_cache (rule : R.taint_rule) match_hook in let fdef_il = AST_to_IL.function_definition taint_inst.lang - ~ctx:!ctx fdef + fdef in let cfg = CFG_build.cfg_of_fdef fdef_il in let info = @@ -530,8 +520,7 @@ let check_rule per_file_formula_cache (rule : R.taint_rule) match_hook [] in let fdef_il = - AST_to_IL.function_definition taint_inst.lang ~ctx:!ctx - fdef + AST_to_IL.function_definition taint_inst.lang fdef in let cfg = CFG_build.cfg_of_fdef fdef_il in let info = @@ -636,7 +625,7 @@ let check_rule per_file_formula_cache (rule : R.taint_rule) match_hook if info.is_lambda_assignment then updated_db else begin let _flow, fdef_effects, _mapping = - check_fundef taint_inst info.name !ctx ~glob_env + check_fundef taint_inst info.name ~glob_env ?class_name:info.class_name_str ~signature_db:updated_db ?builtin_signature_db ?call_graph:(Some relevant_graph) info.fdef @@ -662,8 +651,7 @@ let check_rule per_file_formula_cache (rule : R.taint_rule) match_hook } in let fdef_il = - AST_to_IL.function_definition lang ~ctx:!ctx - synthetic_fdef + AST_to_IL.function_definition lang synthetic_fdef in let cfg = CFG_build.cfg_of_fdef fdef_il in let db', _sig = @@ -776,7 +764,7 @@ let check_rule per_file_formula_cache (rule : R.taint_rule) match_hook --------------------" (IL.str_of_name name)); let _flow, fdef_effects, _mapping = - check_fundef taint_inst name !ctx ~glob_env + check_fundef taint_inst name ~glob_env ?builtin_signature_db fdef in record_matches fdef_effects) diff --git a/src/engine/Match_tainting_mode.mli b/src/engine/Match_tainting_mode.mli index b2869655c..7902db649 100644 --- a/src/engine/Match_tainting_mode.mli +++ b/src/engine/Match_tainting_mode.mli @@ -17,7 +17,6 @@ val hook_setup_hook_function_taint_signature : val check_fundef : Taint_rule_inst.t -> IL.name (** entity being analyzed *) -> - AST_to_IL.ctx -> ?glob_env:Taint_lval_env.t -> ?class_name:string -> ?signature_db:Shape_and_sig.signature_database -> diff --git a/src/engine/tests/Test_dataflow_tainting.ml b/src/engine/tests/Test_dataflow_tainting.ml index 240f264aa..bc0586cd7 100644 --- a/src/engine/tests/Test_dataflow_tainting.ml +++ b/src/engine/tests/Test_dataflow_tainting.ml @@ -32,7 +32,7 @@ let test_tainting taint_inst def = let fcfg, _effects_IGNORED, mapping = Match_tainting_mode.check_fundef taint_inst test_name - AST_to_IL.empty_ctx def + def in DataflowX.display_mapping fcfg.cfg mapping Taint_lval_env.to_string diff --git a/src/parsing/Disambiguate_ruby_calls.ml b/src/parsing/Disambiguate_ruby_calls.ml index 7525f6cff..f0c190a2f 100644 --- a/src/parsing/Disambiguate_ruby_calls.ml +++ b/src/parsing/Disambiguate_ruby_calls.ml @@ -23,16 +23,16 @@ class ['self] visitor = method! visit_expr_kind env ek = match ek with - (* Do not recurse into the direct callee of a Call, but DO visit - the receiver of a DotAccess callee — in `helper.process()`, - `helper` may be an unresolved method call that needs wrapping. *) + (* Visit the callee of a Call unless it is a bare N(Id(...)) — + visiting that would wrap it in another Call, producing a spurious + Call(Call(f, []), args). For compound callees (DotAccess, + ArrayAccess, etc.) we DO recurse so that nested bare identifiers + like `helper` in `helper.process()` get properly wrapped. *) + | Call ({ e = N (Id _); _ } as callee, args) -> + let args = self#visit_arguments env args in + Call (callee, args) | Call (callee, args) -> - let callee = match callee.e with - | DotAccess (receiver, tok, field) -> - let receiver = self#visit_expr env receiver in - { callee with e = DotAccess (receiver, tok, field) } - | _ -> callee - in + let callee = self#visit_expr env callee in let args = self#visit_arguments env args in Call (callee, args) (* Bare unresolved lowercase identifier -- wrap in a zero-arg Call. *) diff --git a/src/tainting/Dataflow_tainting.ml b/src/tainting/Dataflow_tainting.ml index 3d112f813..6bbf6010f 100644 --- a/src/tainting/Dataflow_tainting.ml +++ b/src/tainting/Dataflow_tainting.ml @@ -1547,7 +1547,7 @@ let check_function_call env fun_exp args * This handles the case where a lambda is assigned to a variable like: * callback := func(x) { sink(x) } * callback(source()) - * The signature is stored under the lambda's internal name (_tmp:N), + * The signature is stored under the lambda's internal name (_tmp_lambda:N), * but the variable 'callback' has the Fun shape from the assignment. *) (match fun_exp.e with | Fetch lval -> diff --git a/src/tainting/Graph_from_AST.ml b/src/tainting/Graph_from_AST.ml index 86f62e833..b73021446 100644 --- a/src/tainting/Graph_from_AST.ml +++ b/src/tainting/Graph_from_AST.ml @@ -702,7 +702,8 @@ let extract_hof_callbacks ?(_object_mappings = []) ?(all_funcs = []) (match e.G.e with (* Ruby/Scala block pattern: f(args) { block } is Call(Call(callee, inner_args), [block]). Merge inner_args and block args so the HOF detection sees all arguments together. *) - | G.Call ({ e = G.Call (callee, inner_args); _ }, outer_args) -> + | G.Call ({ e = G.Call (callee, inner_args); _ }, outer_args) + when Lang.(lang =*= Ruby || lang =*= Scala) -> let merged_args = Tok.unsafe_fake_bracket (Tok.unbracket inner_args @ Tok.unbracket outer_args) in let found = extract_hof_callbacks_from_call From c440d540c5bde20d11caa28c7024a121b74a4fcc Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Wed, 15 Apr 2026 12:19:44 +0100 Subject: [PATCH 12/16] ast-to-il: simplify ruby do-block flattening --- src/analyzing/AST_to_IL.ml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/analyzing/AST_to_IL.ml b/src/analyzing/AST_to_IL.ml index 830588c52..2de6d511a 100644 --- a/src/analyzing/AST_to_IL.ml +++ b/src/analyzing/AST_to_IL.ml @@ -852,16 +852,12 @@ and expr_aux env ?(void = false) g_expr : stmts * exp = (* Ruby do-block flattening: `f(args) do |x| ... end` is parsed as Call(Call(f, args), [Lambda]) but the block is semantically an argument to f, not to its return value. Flatten into Call(f, args @ [Lambda]). *) - | G.Call ({ e = G.Call (callee, inner_args); _ }, outer_args) - when env.lang =*= Lang.Ruby - && List.exists - (function - | G.Arg { G.e = G.Lambda _; _ } -> true - | _ -> false) - (Tok.unbracket outer_args) -> + | G.Call ({ e = G.Call (callee, inner_args); _ }, + (_, ([ G.Arg { G.e = G.Lambda _; _ } ] as outer_arg), _ )) + when env.lang =*= Lang.Ruby -> let merged_args = Tok.unsafe_fake_bracket - (Tok.unbracket inner_args @ Tok.unbracket outer_args) + (Tok.unbracket inner_args @ outer_arg) in expr_aux env ~void (G.Call (callee, merged_args) |> G.e) | G.Call (e, args) -> From c29a8f062956db6bddb354de7dbf41fc64abfacd Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Wed, 15 Apr 2026 12:47:16 +0100 Subject: [PATCH 13/16] graph-from-ast: simplify ruby/scala do-block flattening --- src/tainting/Graph_from_AST.ml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tainting/Graph_from_AST.ml b/src/tainting/Graph_from_AST.ml index b73021446..f35726594 100644 --- a/src/tainting/Graph_from_AST.ml +++ b/src/tainting/Graph_from_AST.ml @@ -702,10 +702,11 @@ let extract_hof_callbacks ?(_object_mappings = []) ?(all_funcs = []) (match e.G.e with (* Ruby/Scala block pattern: f(args) { block } is Call(Call(callee, inner_args), [block]). Merge inner_args and block args so the HOF detection sees all arguments together. *) - | G.Call ({ e = G.Call (callee, inner_args); _ }, outer_args) + | G.Call ({ e = G.Call (callee, inner_args); _ }, + (_, ([ G.Arg { G.e = G.Lambda _; _ } ] as outer_arg), _)) when Lang.(lang =*= Ruby || lang =*= Scala) -> let merged_args = Tok.unsafe_fake_bracket - (Tok.unbracket inner_args @ Tok.unbracket outer_args) in + (Tok.unbracket inner_args @ outer_arg) in let found = extract_hof_callbacks_from_call ~method_hofs ~function_hofs ~all_funcs ~caller_parent_path callee merged_args From b53f69d03a1c810adac600e27d359c47c0ecdeb0 Mon Sep 17 00:00:00 2001 From: Dimitris Mostrous Date: Wed, 15 Apr 2026 14:17:43 +0100 Subject: [PATCH 14/16] taint: support functional interface invoke methods (.run, .call, .apply, etc.) Add invoke_methods to Lang_config for languages where lambdas are invoked via named methods on functional interfaces (Java, Kotlin, C#, Ruby). The call graph and taint signature lookup now recognise these as lambda invocations so taint flows through them correctly. --- src/tainting/Dataflow_tainting.ml | 36 +++++----- src/tainting/Graph_from_AST.ml | 21 +++++- src/tainting/Lang_config.ml | 25 +++++++ .../test_invoke_methods_csharp.cs | 31 ++++++++ .../test_invoke_methods_csharp.yaml | 11 +++ .../test_invoke_methods_java.java | 72 +++++++++++++++++++ .../test_invoke_methods_java.yaml | 11 +++ .../test_invoke_methods_kotlin.kt | 28 ++++++++ .../test_invoke_methods_kotlin.yaml | 11 +++ .../test_invoke_methods_ruby.rb | 32 +++++++++ .../test_invoke_methods_ruby.yaml | 11 +++ .../test_lambda_deeply_nested_java.java | 10 +-- 12 files changed, 277 insertions(+), 22 deletions(-) create mode 100644 tests/rules/cross_function_tainting/test_invoke_methods_csharp.cs create mode 100644 tests/rules/cross_function_tainting/test_invoke_methods_csharp.yaml create mode 100644 tests/rules/cross_function_tainting/test_invoke_methods_java.java create mode 100644 tests/rules/cross_function_tainting/test_invoke_methods_java.yaml create mode 100644 tests/rules/cross_function_tainting/test_invoke_methods_kotlin.kt create mode 100644 tests/rules/cross_function_tainting/test_invoke_methods_kotlin.yaml create mode 100644 tests/rules/cross_function_tainting/test_invoke_methods_ruby.rb create mode 100644 tests/rules/cross_function_tainting/test_invoke_methods_ruby.yaml diff --git a/src/tainting/Dataflow_tainting.ml b/src/tainting/Dataflow_tainting.ml index 6bbf6010f..ac9a1f5f7 100644 --- a/src/tainting/Dataflow_tainting.ml +++ b/src/tainting/Dataflow_tainting.ml @@ -1544,14 +1544,22 @@ let check_function_call env fun_exp args | Some _ -> from_db | None -> (* lookup_signature failed - check if callee has a Fun shape in lval_env. - * This handles the case where a lambda is assigned to a variable like: - * callback := func(x) { sink(x) } - * callback(source()) - * The signature is stored under the lambda's internal name (_tmp_lambda:N), - * but the variable 'callback' has the Fun shape from the assignment. *) + * This handles two cases: + * callback(source()) -- direct call, lval = callback + * callback.run(source()) -- invoke method, lval = callback.run + * For invoke methods (e.g. Java Runnable.run), strip the method offset + * and look up the base variable. *) (match fun_exp.e with | Fetch lval -> - (match Lval_env.find_lval env.lval_env lval with + let lval_to_check = + let invoke_methods = (Lang_config.get env.taint_inst.lang).invoke_methods in + match lval.rev_offset with + | [{ o = Dot method_name; _ }] + when List.mem (fst method_name.ident) invoke_methods -> + { lval with rev_offset = [] } + | _ -> lval + in + (match Lval_env.find_lval env.lval_env lval_to_check with | Some (S.Cell (_, S.Fun fun_sig)) -> Log.debug (fun m -> m "SIG_FROM_SHAPE: Found Fun shape for %s" @@ -2123,16 +2131,12 @@ let call_with_intrafile lval_opt e env args instr = * In this case we return empty taints - the callback's return will be handled * when the ToSinkInCall effect is instantiated. *) let is_method_callback_invoke = - (* Check if this is a method call pattern on a callback parameter *) - match env.taint_inst.lang, e_obj, e.e with - | Lang.Java, `Obj (_, S.Arg _), Fetch { rev_offset = { o = Dot name; _ } :: _; _ } -> - (* Java Function.apply or similar callback invocation methods *) - let method_name = fst name.ident in - method_name = "apply" || method_name = "accept" || method_name = "test" || method_name = "get" - | Lang.Ruby, `Obj (_, S.Arg _), Fetch { rev_offset = { o = Dot name; _ } :: _; _ } -> - (* Ruby proc/lambda.call invocation *) - let method_name = fst name.ident in - method_name = "call" + (* Check if this is a method call on a callback parameter + * via a configured invoke method (e.g. .apply, .call, .run). *) + match e_obj, e.e with + | `Obj (_, S.Arg _), Fetch { rev_offset = { o = Dot name; _ } :: _; _ } -> + let invoke_methods = (Lang_config.get env.taint_inst.lang).invoke_methods in + List.mem (fst name.ident) invoke_methods | _ -> false in let callee_is_callback = diff --git a/src/tainting/Graph_from_AST.ml b/src/tainting/Graph_from_AST.ml index f35726594..904974037 100644 --- a/src/tainting/Graph_from_AST.ml +++ b/src/tainting/Graph_from_AST.ml @@ -437,7 +437,26 @@ let extract_calls ~(lang : Lang.t) ?(object_mappings = []) ?(all_funcs = []) ?(c | [] -> Tok.unsafe_fake_tok "") in calls := (fn_id, tok) :: !calls - | None -> ()); + | None -> + (* Invoke-method pattern: var.run() where var is a lambda. + If the method name is a configured invoke method, look for + a lambda with the receiver's name in the current scope. *) + let invoke_methods = (Lang_config.get lang).invoke_methods in + (match callee.G.e with + | G.DotAccess ({ e = G.N (G.Id ((var_name, _), _)); _ }, _, + G.FN (G.Id ((method_name, method_tok), _))) + when List.mem method_name invoke_methods -> + let lambda_match = List.find_opt (fun (f : func_info) -> + match List_.init_and_last_opt f.fn_id with + | Some (f_parent, Some name) + when String.equal (fst name.IL.ident) var_name -> + equal_with_pos f_parent caller_parent_path + | _ -> false + ) all_funcs in + (match lambda_match with + | Some f -> calls := (f.fn_id, method_tok) :: !calls + | None -> ()) + | _ -> ())); (* Check arguments for unresolved function calls (Ruby-style) *) List.iter check_arg_for_unresolved_function_call args_list; (* Visit callee expression for nested calls (e.g., Ruby's File.open(path_for(x)) do ... end diff --git a/src/tainting/Lang_config.ml b/src/tainting/Lang_config.ml index b26ac091b..32b927c85 100644 --- a/src/tainting/Lang_config.ml +++ b/src/tainting/Lang_config.ml @@ -51,6 +51,10 @@ type t = { collection_configs : collection_model_kind list; constructor_names : string list; uses_new_keyword : bool; + (* Methods that invoke `self` as a function. E.g. Runnable.run() in Java, + Proc#call in Ruby. When a variable with Fun shape is the receiver of one + of these methods, the call is treated as a direct lambda invocation. *) + invoke_methods : string list; } (* ========================================================================== *) @@ -62,6 +66,7 @@ let empty = { collection_configs = []; constructor_names = []; uses_new_keyword = false; + invoke_methods = []; } let python = { @@ -84,6 +89,7 @@ let python = { ]; constructor_names = ["__init__"]; uses_new_keyword = false; + invoke_methods = []; } let ruby = { @@ -111,6 +117,7 @@ let ruby = { ]; constructor_names = ["initialize"]; uses_new_keyword = false; + invoke_methods = ["call"]; } let javascript = { @@ -139,6 +146,7 @@ let javascript = { ]; constructor_names = ["constructor"]; uses_new_keyword = true; + invoke_methods = []; } let typescript = { @@ -171,6 +179,7 @@ let java = { ]; constructor_names = [""]; uses_new_keyword = true; + invoke_methods = ["run"; "call"; "apply"; "accept"; "invoke"]; } let kotlin = { @@ -201,6 +210,7 @@ let kotlin = { ]; constructor_names = [""; "init"; "constructor"]; uses_new_keyword = false; + invoke_methods = ["invoke"]; } let scala = { @@ -223,6 +233,7 @@ let scala = { ]; constructor_names = [""]; uses_new_keyword = false; + invoke_methods = []; } let csharp = { @@ -247,6 +258,7 @@ let csharp = { ]; constructor_names = [".ctor"]; uses_new_keyword = true; + invoke_methods = ["Invoke"]; } let go = { @@ -258,6 +270,7 @@ let go = { ]; constructor_names = []; uses_new_keyword = false; + invoke_methods = []; } let rust = { @@ -280,6 +293,7 @@ let rust = { ]; constructor_names = ["new"]; uses_new_keyword = false; + invoke_methods = []; } let swift = { @@ -303,6 +317,7 @@ let swift = { ]; constructor_names = ["init"]; uses_new_keyword = false; + invoke_methods = []; } let php = { @@ -313,6 +328,7 @@ let php = { collection_configs = []; (* PHP collections are mostly handled via builtin functions *) constructor_names = ["__construct"]; uses_new_keyword = true; + invoke_methods = []; } let cpp = { @@ -323,6 +339,7 @@ let cpp = { collection_configs = []; constructor_names = []; uses_new_keyword = false; + invoke_methods = []; } let c = { @@ -335,6 +352,7 @@ let ocaml_lang = { collection_configs = []; constructor_names = []; uses_new_keyword = false; + invoke_methods = []; } let lua = { @@ -342,6 +360,7 @@ let lua = { collection_configs = []; constructor_names = []; uses_new_keyword = false; + invoke_methods = []; } let dart = { @@ -349,6 +368,7 @@ let dart = { collection_configs = []; constructor_names = ["constructor"]; uses_new_keyword = false; + invoke_methods = []; } let elixir = { @@ -364,6 +384,7 @@ let elixir = { collection_configs = []; constructor_names = []; uses_new_keyword = false; + invoke_methods = []; } let julia = { @@ -373,6 +394,7 @@ let julia = { collection_configs = []; constructor_names = []; uses_new_keyword = false; + invoke_methods = []; } let clojure = { @@ -394,6 +416,7 @@ let clojure = { collection_configs = []; constructor_names = []; uses_new_keyword = false; + invoke_methods = []; } let apex = { @@ -401,6 +424,7 @@ let apex = { collection_configs = []; constructor_names = [""]; uses_new_keyword = true; + invoke_methods = []; } let vb = { @@ -408,6 +432,7 @@ let vb = { collection_configs = []; constructor_names = ["New"]; uses_new_keyword = true; + invoke_methods = []; } (* ========================================================================== *) diff --git a/tests/rules/cross_function_tainting/test_invoke_methods_csharp.cs b/tests/rules/cross_function_tainting/test_invoke_methods_csharp.cs new file mode 100644 index 000000000..2aa85eb0a --- /dev/null +++ b/tests/rules/cross_function_tainting/test_invoke_methods_csharp.cs @@ -0,0 +1,31 @@ +// C#: nested lambdas invoked via .Invoke() +class TestInvokeMethods { + + static void test_invoke() { + var x = source(); + Action outer = () => { + Action inner = () => { + // ruleid: test-invoke-methods-csharp + sink(x); + }; + inner.Invoke(); + }; + outer.Invoke(); + } + + // Negative: no taint + static void test_no_taint() { + var x = "clean"; + Action outer = () => { + Action inner = () => { + // ok: test-invoke-methods-csharp + sink(x); + }; + inner.Invoke(); + }; + outer.Invoke(); + } + + static string source() { return "tainted"; } + static void sink(string x) {} +} diff --git a/tests/rules/cross_function_tainting/test_invoke_methods_csharp.yaml b/tests/rules/cross_function_tainting/test_invoke_methods_csharp.yaml new file mode 100644 index 000000000..da83a39a0 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_invoke_methods_csharp.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-invoke-methods-csharp + message: Taint flows through lambda invoked via .Invoke() + languages: + - csharp + severity: WARNING + mode: taint + pattern-sources: + - pattern: source(...) + pattern-sinks: + - pattern: sink(...) diff --git a/tests/rules/cross_function_tainting/test_invoke_methods_java.java b/tests/rules/cross_function_tainting/test_invoke_methods_java.java new file mode 100644 index 000000000..a0646b5b4 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_invoke_methods_java.java @@ -0,0 +1,72 @@ +class TestInvokeMethods { + + // Function.apply: nested lambdas invoked via apply + static void test_apply() { + String x = source(); + Function outer = (a) -> { + Function inner = (b) -> { + // ruleid: test-invoke-methods-java + sink(b); + return b; + }; + return inner.apply(a); + }; + outer.apply(x); + } + + // Consumer.accept: nested lambdas invoked via accept + static void test_accept() { + String x = source(); + Consumer outer = (a) -> { + Consumer inner = (b) -> { + // ruleid: test-invoke-methods-java + sink(b); + }; + inner.accept(a); + }; + outer.accept(x); + } + + // Runnable.run: nested lambdas capturing tainted variable + static void test_run() { + String x = source(); + Runnable outer = () -> { + Runnable inner = () -> { + // ruleid: test-invoke-methods-java + sink(x); + }; + inner.run(); + }; + outer.run(); + } + + // Callable.call: nested lambdas capturing tainted variable + static void test_call() { + String x = source(); + Callable outer = () -> { + Callable inner = () -> { + // ruleid: test-invoke-methods-java + sink(x); + return x; + }; + return inner.call(); + }; + outer.call(); + } + + // Negative: no taint source, nested + static void test_no_taint() { + String x = "clean"; + Runnable outer = () -> { + Runnable inner = () -> { + // ok: test-invoke-methods-java + sink(x); + }; + inner.run(); + }; + outer.run(); + } + + static String source() { return "tainted"; } + static void sink(String x) {} +} diff --git a/tests/rules/cross_function_tainting/test_invoke_methods_java.yaml b/tests/rules/cross_function_tainting/test_invoke_methods_java.yaml new file mode 100644 index 000000000..e5e81a6ec --- /dev/null +++ b/tests/rules/cross_function_tainting/test_invoke_methods_java.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-invoke-methods-java + message: Taint flows through lambda invoked via functional interface method + languages: + - java + severity: WARNING + mode: taint + pattern-sources: + - pattern: source(...) + pattern-sinks: + - pattern: sink(...) diff --git a/tests/rules/cross_function_tainting/test_invoke_methods_kotlin.kt b/tests/rules/cross_function_tainting/test_invoke_methods_kotlin.kt new file mode 100644 index 000000000..3668a55e8 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_invoke_methods_kotlin.kt @@ -0,0 +1,28 @@ +// Kotlin: nested lambdas invoked via .invoke() +fun test_invoke() { + val x = source() + val outer: () -> Unit = { + val inner: () -> Unit = { + // ruleid: test-invoke-methods-kotlin + sink(x) + } + inner.invoke() + } + outer.invoke() +} + +// Negative: no taint +fun test_no_taint() { + val x = "clean" + val outer: () -> Unit = { + val inner: () -> Unit = { + // ok: test-invoke-methods-kotlin + sink(x) + } + inner.invoke() + } + outer.invoke() +} + +fun source(): String = "tainted" +fun sink(x: String) {} diff --git a/tests/rules/cross_function_tainting/test_invoke_methods_kotlin.yaml b/tests/rules/cross_function_tainting/test_invoke_methods_kotlin.yaml new file mode 100644 index 000000000..498f0d30d --- /dev/null +++ b/tests/rules/cross_function_tainting/test_invoke_methods_kotlin.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-invoke-methods-kotlin + message: Taint flows through lambda invoked via .invoke() + languages: + - kotlin + severity: WARNING + mode: taint + pattern-sources: + - pattern: source() + pattern-sinks: + - pattern: sink($X) diff --git a/tests/rules/cross_function_tainting/test_invoke_methods_ruby.rb b/tests/rules/cross_function_tainting/test_invoke_methods_ruby.rb new file mode 100644 index 000000000..cc0c051a4 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_invoke_methods_ruby.rb @@ -0,0 +1,32 @@ +# Ruby: nested lambdas invoked via .call() +def test_call() + x = source() + outer = ->(a) { + inner = ->(b) { + # ruleid: test-invoke-methods-ruby + sink(b) + } + inner.call(a) + } + outer.call(x) +end + +# Negative: no taint +def test_no_taint() + x = "clean" + outer = ->() { + inner = ->() { + # ok: test-invoke-methods-ruby + sink(x) + } + inner.call() + } + outer.call() +end + +def source() + "tainted" +end + +def sink(x) +end diff --git a/tests/rules/cross_function_tainting/test_invoke_methods_ruby.yaml b/tests/rules/cross_function_tainting/test_invoke_methods_ruby.yaml new file mode 100644 index 000000000..396ac8f67 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_invoke_methods_ruby.yaml @@ -0,0 +1,11 @@ +rules: + - id: test-invoke-methods-ruby + message: Taint flows through lambda invoked via .call() + languages: + - ruby + severity: WARNING + mode: taint + pattern-sources: + - pattern: source(...) + pattern-sinks: + - pattern: sink(...) diff --git a/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.java b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.java index da31a621d..c5d733cab 100644 --- a/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.java +++ b/tests/rules/cross_function_tainting/test_lambda_deeply_nested_java.java @@ -9,11 +9,11 @@ static void test4() { // ruleid: test-lambda-deeply-nested-java sink(x); }; - level3(); + level3.run(); }; - level2(); + level2.run(); }; - level1(); + level1.run(); } // Test: Deeply nested lambdas split across functions @@ -23,9 +23,9 @@ static void test4_level1(String x) { // ruleid: test-lambda-deeply-nested-java sink(x); }; - level3(); + level3.run(); }; - level2(); + level2.run(); } static void test4_caller() { From a0ad2439c33d2454be65f93a35ede19157596cb3 Mon Sep 17 00:00:00 2001 From: corneliuhoffman Date: Thu, 16 Apr 2026 10:24:11 +0100 Subject: [PATCH 15/16] added lambda taint as well as test, modified the same name test run_check_fundef_if_needed used to skip check_fundef for is_lambda_assignment entries, so local source-to-sink flows inside lambdas (e.g. record-field lambdas like `{ h: function(d){ sink(d); } }`) were only ever stored in the signature and lost when no call site resolved. Changes: - Match_tainting_mode.ml: always run check_fundef; for lambda assignments filter effects to ToSink with Src-origin taint and PBool true precondition. BArg/other parameterised taints still ride the signature unchanged. - test_lambda_in_object_literal.{yaml,js}: new coverage for the record-field case. - test_same_name_functions.go: reworked to exercise only same-name confusion (second lambda now uses safe(s), both fns are called); still fails pre-#617. --- src/engine/Match_tainting_mode.ml | 57 +++++++++++++++---- .../test_lambda_in_object_literal.js | 15 +++++ .../test_lambda_in_object_literal.yaml | 13 +++++ .../test_same_name_functions.go | 3 +- 4 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 tests/rules/cross_function_tainting/test_lambda_in_object_literal.js create mode 100644 tests/rules/cross_function_tainting/test_lambda_in_object_literal.yaml diff --git a/src/engine/Match_tainting_mode.ml b/src/engine/Match_tainting_mode.ml index 95a2c564a..95c06ab4c 100644 --- a/src/engine/Match_tainting_mode.ml +++ b/src/engine/Match_tainting_mode.ml @@ -622,17 +622,52 @@ let check_rule per_file_formula_cache (rule : R.taint_rule) match_hook let run_check_fundef_if_needed (info : fun_info) (updated_db : Shape_and_sig.signature_database) : Shape_and_sig.signature_database = - if info.is_lambda_assignment then updated_db - else begin - let _flow, fdef_effects, _mapping = - check_fundef taint_inst info.name ~glob_env - ?class_name:info.class_name_str ~signature_db:updated_db - ?builtin_signature_db - ?call_graph:(Some relevant_graph) info.fdef - in - record_matches fdef_effects; - updated_db - end + let _flow, fdef_effects, _mapping = + check_fundef taint_inst info.name ~glob_env + ?class_name:info.class_name_str ~signature_db:updated_db + ?builtin_signature_db + ?call_graph:(Some relevant_graph) info.fdef + in + (* For lambda assignments we only record "unconditional" ToSink + effects — those where the taint at the sink comes from a + concrete pattern-source match (e.g. a parameter declared as a + source via `pattern-inside: function $X(..., $RES, ...) {...}`). + Effects whose taint is purely parameterized (BArg) still ride + through the signature at resolved call sites; effects mixing + both get an Src-only slice surfaced here. *) + let effects_to_record = + if info.is_lambda_assignment then + fdef_effects + |> Effects.elements + |> List.filter_map (fun eff -> + match eff with + | Effect.ToSink sink_info -> + let items, _ = + sink_info.taints_with_precondition + in + let src_items = + List.filter + (fun (item : Effect.taint_to_sink_item) -> + match item.taint.orig with + | Taint.Src _ -> true + | _ -> false) + items + in + if List_.null src_items then None + else + Some + (Effect.ToSink + { + sink_info with + taints_with_precondition = + (src_items, Rule.PBool true); + }) + | _ -> None) + |> Effects.of_list + else fdef_effects + in + record_matches effects_to_record; + updated_db in let process_fun_info info db = diff --git a/tests/rules/cross_function_tainting/test_lambda_in_object_literal.js b/tests/rules/cross_function_tainting/test_lambda_in_object_literal.js new file mode 100644 index 000000000..9268b90b0 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_in_object_literal.js @@ -0,0 +1,15 @@ +// Lambda assigned as a property of an object literal, then called via +// unresolved property access. The parameter `data` matches the source +// pattern (concrete source), so `sink(data)` should fire regardless of +// whether the call graph can resolve `x.success(a)` back to the lambda. + +function test1(a) { + var x = { + url: '/api/settings', + success: function(data) { + // ruleid: taint-func-param + sink(data); + } + }; + x.success(a); +} diff --git a/tests/rules/cross_function_tainting/test_lambda_in_object_literal.yaml b/tests/rules/cross_function_tainting/test_lambda_in_object_literal.yaml new file mode 100644 index 000000000..e88144206 --- /dev/null +++ b/tests/rules/cross_function_tainting/test_lambda_in_object_literal.yaml @@ -0,0 +1,13 @@ +rules: + - id: taint-func-param + message: Tainted parameter reaches sink + languages: [javascript] + severity: ERROR + mode: taint + pattern-sources: + - patterns: + - pattern-inside: | + function $X(..., $RES, ...) {...} + - focus-metavariable: $RES + pattern-sinks: + - pattern: sink(...) diff --git a/tests/rules/cross_function_tainting/test_same_name_functions.go b/tests/rules/cross_function_tainting/test_same_name_functions.go index 7b9c850d3..875f558cd 100644 --- a/tests/rules/cross_function_tainting/test_same_name_functions.go +++ b/tests/rules/cross_function_tainting/test_same_name_functions.go @@ -11,6 +11,7 @@ func test(input string) { func test(input string) { var fn = func(s string) { // ok: taint-func-param - sink(s) + safe(s) } + fn("") } From 217f5332c094833ce785ec68c3c06e8c64a1f2e1 Mon Sep 17 00:00:00 2001 From: corneliuhoffman Date: Thu, 16 Apr 2026 13:59:24 +0100 Subject: [PATCH 16/16] fixed Effects.filter_map, inherited preconds --- src/engine/Match_tainting_mode.ml | 50 ++++++++++++++----------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/src/engine/Match_tainting_mode.ml b/src/engine/Match_tainting_mode.ml index 95c06ab4c..4ff10a10e 100644 --- a/src/engine/Match_tainting_mode.ml +++ b/src/engine/Match_tainting_mode.ml @@ -635,35 +635,31 @@ let check_rule per_file_formula_cache (rule : R.taint_rule) match_hook Effects whose taint is purely parameterized (BArg) still ride through the signature at resolved call sites; effects mixing both get an Src-only slice surfaced here. *) + let keep_src_toSink_only (eff : Effect.t) : Effect.t option = + match eff with + | Effect.ToSink si -> + let items, precond = si.taints_with_precondition in + let src_items = + List.filter + (fun (i : Effect.taint_to_sink_item) -> + match i.taint.orig with + | Taint.Src _ -> true + | _ -> false) + items + in + if List_.null src_items then None + else + Some + (Effect.ToSink + { + si with + taints_with_precondition = (src_items, precond); + }) + | _ -> None + in let effects_to_record = if info.is_lambda_assignment then - fdef_effects - |> Effects.elements - |> List.filter_map (fun eff -> - match eff with - | Effect.ToSink sink_info -> - let items, _ = - sink_info.taints_with_precondition - in - let src_items = - List.filter - (fun (item : Effect.taint_to_sink_item) -> - match item.taint.orig with - | Taint.Src _ -> true - | _ -> false) - items - in - if List_.null src_items then None - else - Some - (Effect.ToSink - { - sink_info with - taints_with_precondition = - (src_items, Rule.PBool true); - }) - | _ -> None) - |> Effects.of_list + Effects.filter_map keep_src_toSink_only fdef_effects else fdef_effects in record_matches effects_to_record;