diff --git a/core/opentaint-java-querylang/samples/src/main/java/example/JoinTagUnion.java b/core/opentaint-java-querylang/samples/src/main/java/example/JoinTagUnion.java new file mode 100644 index 000000000..88725b046 --- /dev/null +++ b/core/opentaint-java-querylang/samples/src/main/java/example/JoinTagUnion.java @@ -0,0 +1,61 @@ +package example; + +import base.RuleSample; +import base.RuleSet; + +@RuleSet("example/JoinTagUnion.yaml") +public abstract class JoinTagUnion implements RuleSample { + + static Object srcA() { return null; } + static Object srcB() { return null; } + static void sinkX(Object v) {} + static void sinkY(Object v) {} + + /** Positive: source A into sink X. */ + static class PositiveAtoX extends JoinTagUnion { + @Override public void entrypoint() { + Object a = srcA(); + sinkX(a); + } + } + + /** Positive: source B into sink Y (second tag-expanded source, second sink). */ + static class PositiveBtoY extends JoinTagUnion { + @Override public void entrypoint() { + Object b = srcB(); + sinkY(b); + } + } + + /** Positive: source A into sink Y (cross-path; proves the tag union is not source-paired). */ + static class PositiveAtoY extends JoinTagUnion { + @Override public void entrypoint() { + Object v = srcA(); + sinkY(v); + } + } + + /** Positive: source B into sink X (cross-path; proves the tag union is not source-paired). */ + static class PositiveBtoX extends JoinTagUnion { + @Override public void entrypoint() { + Object v = srcB(); + sinkX(v); + } + } + + /** Negative: untainted value into a sink. */ + static class NegativeCleanIntoSink extends JoinTagUnion { + @Override public void entrypoint() { + Object c = "safe"; + sinkX(c); + } + } + + /** Negative: tainted source never reaches a sink. */ + static class NegativeNoSink extends JoinTagUnion { + @Override public void entrypoint() { + Object a = srcA(); + System.out.println(a); + } + } +} diff --git a/core/opentaint-java-querylang/samples/src/main/resources/example/JoinTagUnion.yaml b/core/opentaint-java-querylang/samples/src/main/resources/example/JoinTagUnion.yaml new file mode 100644 index 000000000..fd636de6d --- /dev/null +++ b/core/opentaint-java-querylang/samples/src/main/resources/example/JoinTagUnion.yaml @@ -0,0 +1,51 @@ +rules: + - id: tag-source-a + options: { lib: true } + tags: [demo-untrusted] + severity: NOTE + message: source a + languages: [java] + patterns: + - pattern: $V = srcA() + + - id: tag-source-b + options: { lib: true } + tags: [demo-untrusted] + severity: NOTE + message: source b + languages: [java] + patterns: + - pattern: $V = srcB() + + - id: tag-sink-x + options: { lib: true } + severity: NOTE + message: sink x + languages: [java] + patterns: + - pattern: sinkX($V) + + - id: tag-sink-y + options: { lib: true } + severity: NOTE + message: sink y + languages: [java] + patterns: + - pattern: sinkY($V) + + - id: join-tag-union + severity: ERROR + message: Tag union across two sources into two sinks + languages: [java] + mode: join + join: + refs: + - tag: demo-untrusted + as: untrusted + - rule: tag-sink-x + as: sinkx + - rule: tag-sink-y + as: sinky + on: + - 'untrusted.$V -> sinkx.$V' + - 'untrusted.$V -> sinky.$V' diff --git a/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepRuleLoadErrorMessage.kt b/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepRuleLoadErrorMessage.kt index 2a6634deb..ee2a59d48 100644 --- a/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepRuleLoadErrorMessage.kt +++ b/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepRuleLoadErrorMessage.kt @@ -224,11 +224,8 @@ class JoinRuleWithNoOperations : UnsupportedFeatureBlockingMessage() { } class JoinRuleWithChainedOperations : UnsupportedFeatureBlockingMessage() { - override val message: String = "Join rule with chained operations is not supported; only a single join condition is allowed" -} - -class JoinRuleWithMultipleDistinctRightItems : UnsupportedFeatureBlockingMessage() { - override val message: String = "Join rule references multiple distinct right-hand rules; only a single right-hand rule is supported" + override val message: String = + "Join rule chains an alias as both a source and a sink; intermediate (chained) nodes are not supported" } class JoinOnTaintRuleWithNonEmptySources : RuleIssueBlockingMessage() { @@ -251,6 +248,36 @@ class JoinIsImpossibleNoLabelFound(label: String) : RuleIssueBlockingMessage() { override val message: String = "Join is impossible: taint label '$label' required by the join condition was not found in the left-hand rule" } +class EmptyTagExpansion(tag: String) : RuleIssueBlockingMessage() { + override val message: String = + "Join ref targets tag '$tag', but no rule declares that tag" +} + +class JoinRefMissingTarget : RuleIssueBlockingMessage() { + override val message: String = + "Join ref must specify exactly one of 'rule' or 'tag', but neither was given" +} + +class JoinRefAmbiguousTarget : RuleIssueBlockingMessage() { + override val message: String = + "Join ref must specify exactly one of 'rule' or 'tag', but both were given" +} + +class JoinRefToUnsupportedRuleKind(ruleId: String) : RuleIssueBlockingMessage() { + override val message: String = + "Join ref resolves to '$ruleId', which is a join rule; only search/taint rules may be wired into a join" +} + +class JoinRefDuplicateAlias(alias: String) : RuleIssueBlockingMessage() { + override val message: String = + "Join alias '$alias' is declared by more than one ref; each ref must use a distinct 'as' alias (use a single 'tag' ref to union several rules under one alias)" +} + +class JoinAliasMetavarConflict(alias: String) : RuleIssueBlockingMessage() { + override val message: String = + "Join alias '$alias' is referenced with conflicting metavariables across 'on' conditions; an alias must use a single metavariable on each side" +} + class FailedToConvertToTaintRule(causeMessage: String?) : InternalWarningBlockingMessage() { override val message: String = "Failed to convert automata to a taint rule: ${causeMessage ?: "unknown error"}" } diff --git a/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepRuleLoader.kt b/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepRuleLoader.kt index 0bb2fe49e..9703e81f4 100644 --- a/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepRuleLoader.kt +++ b/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepRuleLoader.kt @@ -56,6 +56,7 @@ class SemgrepRuleLoader( ) private val registeredRules = hashMapOf() + private val tagIndex = hashMapOf>() fun registerRuleSet( ruleSetText: String, @@ -95,6 +96,15 @@ class SemgrepRuleLoader( semgrepFileTrace.info("Register ${supportedRules.size} rules") } + private fun buildTagIndex() { + tagIndex.clear() + for (registered in registeredRules.values) { + for (tag in registered.rule.tags) { + tagIndex.getOrPut(tag, ::mutableListOf).add(registered.ruleId) + } + } + } + private fun registerRule(rule: RegisteredRule) { if (rule.ruleId in registeredRules) { rule.ruleTrace.stepTrace(Step.LOAD_RULESET) @@ -119,6 +129,8 @@ class SemgrepRuleLoader( registeredRules.values.toList() .forEach { parseRule(it, forceLibraryMode = false) } + buildTagIndex() + resolveRuleOverrides() parsedRules.values @@ -385,14 +397,18 @@ class SemgrepRuleLoader( } private fun buildJoinRule(rule: JoinRule<*>, trace: SemgrepRuleLoadStepTrace): TaintAutomataJoinRule? { - val items = hashMapOf() + val items = hashMapOf>() val itemRenames = hashMapOf>>() val strategy = strategyFor(rule.info) ?: return null for (ref in rule.refs) { - val refId = resolveRefRuleId(ref.rule, rule.info.pathInfo.ruleRelativePath) - val itemAutomata = resolveBuiltRuleWrtOverrides(refId, trace, hashSetOf()) + if (ref.`as` in items) { + trace.error(JoinRefDuplicateAlias(ref.`as`)) + return null + } + + val refIds = resolveRefTargets(ref, rule.info.pathInfo.ruleRelativePath, trace) ?: return null val renames = ref.renames.map { @@ -401,7 +417,16 @@ class SemgrepRuleLoader( Pair(from, to) } - items[ref.`as`] = TaintAutomataJoinRuleItem(itemAutomata.info.ruleId, itemAutomata.rule) + val aliasItems = items.getOrPut(ref.`as`, ::mutableListOf) + for (refId in refIds) { + if (parsedRules[refId] is JoinRule<*>) { + trace.error(JoinRefToUnsupportedRuleKind(refId)) + return null + } + val itemAutomata = resolveBuiltRuleWrtOverrides(refId, trace, hashSetOf()) + ?: return null + aliasItems += TaintAutomataJoinRuleItem(itemAutomata.info.ruleId, itemAutomata.rule) + } itemRenames[ref.`as`] = renames } @@ -422,7 +447,31 @@ class SemgrepRuleLoader( return null } - return TaintAutomataJoinRule(items, operations) + return TaintAutomataJoinRule(items.mapValues { it.value.toList() }, operations) + } + + private fun resolveRefTargets( + ref: SemgrepYamlJoinRuleRef, + ruleRelativePath: Path, + trace: SemgrepRuleLoadStepTrace + ): List? { + val hasRule = ref.rule != null + val hasTag = ref.tag != null + if (hasRule == hasTag) { + trace.error(if (hasRule) JoinRefAmbiguousTarget() else JoinRefMissingTarget()) + return null + } + + if (hasRule) { + return listOf(resolveRefRuleId(ref.rule!!, ruleRelativePath)) + } + + val matched = tagIndex[ref.tag] + if (matched.isNullOrEmpty()) { + trace.error(EmptyTagExpansion(ref.tag!!)) + return null + } + return matched.distinct().sorted() } private fun LanguageStrategy<*, *>.parseJoinMetaVarWithRenames( diff --git a/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepYamlParsing.kt b/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepYamlParsing.kt index 584bb6524..fc6fe6c32 100644 --- a/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepYamlParsing.kt +++ b/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/SemgrepYamlParsing.kt @@ -26,6 +26,7 @@ data class SemgrepYamlRuleSet( @Serializable data class SemgrepYamlRule( val id: String, + val tags: List = emptyList(), val languages: List? = null, val pattern: String? = null, val mode: String? = null, @@ -56,7 +57,8 @@ data class SemgrepYamlJoinRule( @Serializable data class SemgrepYamlJoinRuleRef( - val rule: String, + val rule: String? = null, + val tag: String? = null, @SerialName("as") val `as`: String, val renames: List = emptyList() diff --git a/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/conversion/taint/JoinRuleProcessing.kt b/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/conversion/taint/JoinRuleProcessing.kt index 2d414a1cf..5b4a9a938 100644 --- a/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/conversion/taint/JoinRuleProcessing.kt +++ b/core/opentaint-java-querylang/src/main/kotlin/org/opentaint/semgrep/pattern/conversion/taint/JoinRuleProcessing.kt @@ -2,11 +2,11 @@ package org.opentaint.semgrep.pattern.conversion.taint import org.opentaint.dataflow.configuration.jvm.serialized.PositionBase import org.opentaint.semgrep.pattern.ComplexMetavarInJoin +import org.opentaint.semgrep.pattern.JoinAliasMetavarConflict import org.opentaint.semgrep.pattern.GeneratedTaintMark import org.opentaint.semgrep.pattern.JoinIsImpossibleNoLabelFound import org.opentaint.semgrep.pattern.JoinOnTaintRuleWithNonEmptySources import org.opentaint.semgrep.pattern.JoinRuleWithChainedOperations -import org.opentaint.semgrep.pattern.JoinRuleWithMultipleDistinctRightItems import org.opentaint.semgrep.pattern.JoinRuleWithNoOperations import org.opentaint.semgrep.pattern.JoinRuleWithUnsupportedOperation import org.opentaint.semgrep.pattern.LeftTaintRuleMustHaveSources @@ -27,7 +27,7 @@ import org.opentaint.semgrep.pattern.conversion.TaintRuleStrategy import org.opentaint.semgrep.pattern.conversion.taint.composition.JoinRightCompositionStrategy data class TaintAutomataJoinRule( - val items: Map, + val items: Map>, val operations: List ) @@ -37,7 +37,7 @@ data class TaintAutomataJoinRuleItem( ) data class TaintAutomataJoinMetaVarRef( - val itemId: String, + val alias: String, val metaVar: MetavarAtom ) @@ -47,6 +47,11 @@ data class TaintAutomataJoinOperation( val rhs: TaintAutomataJoinMetaVarRef ) +/** + * A multi-sink join: one or more sources are wired directly to one or more sinks. Tag-expanded + * aliases contribute every rule they resolve to, so a single `on` line can fan a union of sources + * into a union of sinks. Chaining (an alias acting as both a source and a sink) is not supported. + */ fun RuleConversionCtx.convertTaintAutomataJoinToTaintRules( strategy: TaintRuleStrategy, rule: TaintAutomataJoinRule @@ -56,196 +61,183 @@ fun RuleConversionCtx.convertTaintAutomataJoinToTain trace.error(JoinRuleWithUnsupportedOperation(nonComposeOp.op)) return null } - if (rule.operations.isEmpty()) { trace.error(JoinRuleWithNoOperations()) return null } - if (!validateNoChainedOperations(rule.operations)) { + val sourceAliases = rule.operations.mapTo(linkedSetOf()) { it.lhs.alias } + val sinkAliases = rule.operations.mapTo(linkedSetOf()) { it.rhs.alias } + if (sourceAliases.intersect(sinkAliases).isNotEmpty()) { trace.error(JoinRuleWithChainedOperations()) return null } - val operationsByRightItem = rule.operations.groupBy { it.rhs } - if (operationsByRightItem.size > 1) { - trace.error(JoinRuleWithMultipleDistinctRightItems()) - return null + // Each alias must use a single metavariable on the side it appears on. + val sourceVar = hashMapOf() + val sinkVar = hashMapOf() + for (op in rule.operations) { + val prevSource = sourceVar.put(op.lhs.alias, op.lhs.metaVar) + if (prevSource != null && prevSource != op.lhs.metaVar) { + trace.error(JoinAliasMetavarConflict(op.lhs.alias)) + return null + } + val prevSink = sinkVar.put(op.rhs.alias, op.rhs.metaVar) + if (prevSink != null && prevSink != op.rhs.metaVar) { + trace.error(JoinAliasMetavarConflict(op.rhs.alias)) + return null + } } - val (rightItemRef, compositions) = operationsByRightItem.entries.first() - val leftItemRefs = compositions.map { it.lhs } - return convertCompositionJoinOperations(strategy, rule, rightItemRef, leftItemRefs) -} - -private fun validateNoChainedOperations(operations: List): Boolean { - val leftItems = operations.map { it.lhs.itemId }.toSet() - val rightItems = operations.map { it.rhs.itemId }.toSet() - return leftItems.intersect(rightItems).isEmpty() -} - -private fun RuleConversionCtx.convertCompositionJoinOperations( - strategy: TaintRuleStrategy, - rule: TaintAutomataJoinRule, - rightItemRef: TaintAutomataJoinMetaVarRef, - leftItemRefs: List, -): TaintRuleFromSemgrep? { - val allLeftRules = mutableListOf>() - val allLeftFinalMarks = hashSetOf() - - for (leftItemRef in leftItemRefs) { - val leftItem = rule.items.getValue(leftItemRef.itemId) - val leftAutomata = leftItem.rule - - val leftCtx = RuleConversionCtx("$ruleId#${leftItemRef.itemId}", meta, trace, typeOps) - val (leftRules, leftFinalMarks) = leftCtx.convertCompositionLeftRule( - strategy, leftAutomata, leftItemRef.metaVar - ) ?: return null - - allLeftRules.addAll(leftRules) - allLeftFinalMarks.addAll(leftFinalMarks) + val allGroups = mutableListOf>() + + // Convert each source once and remember the marks it produces. + val marksBySource = hashMapOf>() + for (alias in sourceAliases) { + val produced = hashSetOf() + rule.items.getValue(alias).forEachIndexed { idx, item -> + val nodeCtx = RuleConversionCtx("$ruleId#$alias#$idx", meta, trace, typeOps) + val (groups, marks) = nodeCtx.convertCompositionSourceRule(strategy, item.rule, sourceVar.getValue(alias)) + ?: return null + allGroups += groups + produced += marks + } + marksBySource[alias] = produced } - val rightItem = rule.items.getValue(rightItemRef.itemId) - val rightAutomata = rightItem.rule - val rightRules = when (rightAutomata) { - is SemgrepMatchingRule -> convertCompositionRightMatchingRule( - strategy, rightAutomata, rightItemRef.metaVar, allLeftFinalMarks - ) - - is SemgrepTaintRule -> convertCompositionRightTaintRule( - strategy, rightAutomata, rightItemRef.metaVar, allLeftFinalMarks - ) ?: return null + // Each sink consumes the marks of the sources wired to it. + for (alias in sinkAliases) { + val incomingMarks = rule.operations + .filter { it.rhs.alias == alias } + .flatMapTo(hashSetOf()) { marksBySource[it.lhs.alias].orEmpty() } + rule.items.getValue(alias).forEachIndexed { idx, item -> + val nodeCtx = RuleConversionCtx("$ruleId#$alias#$idx", meta, trace, typeOps) + allGroups += nodeCtx.convertCompositionSinkRule(strategy, item.rule, sinkVar.getValue(alias), incomingMarks) + ?: return null + } } - return TaintRuleFromSemgrep(ruleId, allLeftRules + rightRules) + return TaintRuleFromSemgrep(ruleId, allGroups) } -private fun RuleConversionCtx.convertCompositionLeftRule( +private fun RuleConversionCtx.convertCompositionSourceRule( strategy: TaintRuleStrategy, automata: SemgrepRule>, finalVar: MetavarAtom, -): Pair>, Set>? { - return when (automata) { - is SemgrepMatchingRule -> convertCompositionLeftMatchingRule(strategy, automata, finalVar) - is SemgrepTaintRule -> convertCompositionLeftTaintRule(strategy, automata, finalVar) - } +): Pair>, Set>? = when (automata) { + is SemgrepMatchingRule -> convertCompositionLeftMatchingRule(strategy, automata, finalVar) + is SemgrepTaintRule -> convertCompositionLeftTaintRule(strategy, automata, finalVar) } -private fun RuleConversionCtx.convertCompositionLeftMatchingRule( +private fun RuleConversionCtx.convertCompositionSinkRule( + strategy: TaintRuleStrategy, + automata: SemgrepRule>, + initialVar: MetavarAtom, + sourceMarks: Set, +): List>? = when (automata) { + is SemgrepMatchingRule -> convertCompositionRightMatchingRule(strategy, automata, initialVar, sourceMarks) + is SemgrepTaintRule -> convertCompositionRightTaintRule(strategy, automata, sourceMarks) +} + +/** + * How a matching-rule join node is wired into the surrounding flow. [buildMatchingNode] derives every + * knob (which metavars anchor the automaton, which edges to fold, whether to inject upstream marks, + * which marks to produce) from the variant rather than from independently-settable flags. + * + * - [Source] produces marks at [produceAt] and consumes nothing. + * - [Sink] consumes [incomingMarks] at [consumeAt] and produces nothing. + */ +private sealed class MatchingFlow( + val consumeAt: MetavarAtom?, + val incomingMarks: Set, + val produceAt: MetavarAtom?, +) { + class Source(produceAt: MetavarAtom) : MatchingFlow(consumeAt = null, incomingMarks = emptySet(), produceAt = produceAt) + class Sink(consumeAt: MetavarAtom, incomingMarks: Set) : + MatchingFlow(consumeAt = consumeAt, incomingMarks = incomingMarks, produceAt = null) +} + +/** The single matching-rule node conversion shared by both join roles; see [MatchingFlow] for the wiring. */ +private fun RuleConversionCtx.buildMatchingNode( strategy: TaintRuleStrategy, automata: SemgrepMatchingRule>, - finalVar: MetavarAtom, + flow: MatchingFlow, ): Pair>, Set> { - val leftEdges = automata.flatMap { r -> + val edges = automata.flatMap { r -> val automataWithVars = TaintRegisterStateAutomataWithStateVars( - r.rule, initialStateVars = emptySet(), acceptStateVars = setOf(finalVar) + r.rule, initialStateVars = setOfNotNull(flow.consumeAt), acceptStateVars = setOfNotNull(flow.produceAt) ) - val taintEdges = safeConvertToTaintRules { - generateTaintAutomataEdges(automataWithVars, r.metaVarInfo) - } - listOfNotNull(taintEdges) + listOfNotNull(safeConvertToTaintRules { generateTaintAutomataEdges(automataWithVars, r.metaVarInfo) }) } - val leftCtx = leftEdges.rules.mapIndexed { idx, r -> - val taintEdgesWithAssign = r.copy( - edges = r.edges + r.edgesToFinalAccept, - edgesToFinalAccept = emptyList() - ) - TaintRuleGenerationCtx( - RuleUniqueMarkPrefix(ruleId, idx), - taintEdgesWithAssign, - compositionStrategy = null, - strategy, - ) + // A producer folds its accept edges into the body so its mark is assigned during generation; a pure + // sink discards trivially-satisfied conditions, every other role keeps them. + val foldAcceptEdges = flow.produceAt != null + val discardMode = if (flow is MatchingFlow.Sink) SinkDiscardMode.TRIVIAL_CONDITION else SinkDiscardMode.NONE + + val ctxs = edges.rules.mapIndexed { idx, r -> + val nodeEdges = if (foldAcceptEdges) { + r.copy(edges = r.edges + r.edgesToFinalAccept, edgesToFinalAccept = emptyList()) + } else { + r + } + val composition = flow.consumeAt?.let { JoinRightCompositionStrategy(nodeEdges, it, flow.incomingMarks, strategy) } + TaintRuleGenerationCtx(RuleUniqueMarkPrefix(ruleId, idx), nodeEdges, composition, strategy) } - val leftRules = leftCtx.mapNotNull { + val groups = ctxs.mapNotNull { safeConvertToTaintRules { - val generatedRules = strategy.generateTaintRules(it, this, SinkDiscardMode.NONE) - TaintRuleFromSemgrep.TaintRuleGroup(generatedRules) + TaintRuleFromSemgrep.TaintRuleGroup(strategy.generateTaintRules(it, this, discardMode)) } } - val leftFinalMarks = hashSetOf() - leftCtx.forEach { ctx -> - ctx.automata.finalAcceptStates.forEach { s -> - ctx.stateAssignMark(finalVar, s, PositionBase.Result.base()).forEach { assign -> - leftFinalMarks.add(strategy.assignedMark(assign)) + val producedMarks = hashSetOf() + val produceAt = flow.produceAt + if (produceAt != null) { + ctxs.forEach { ctx -> + ctx.automata.finalAcceptStates.forEach { s -> + ctx.stateAssignMark(produceAt, s, PositionBase.Result.base()).forEach { assign -> + producedMarks.add(strategy.assignedMark(assign)) + } } } } - return leftRules to leftFinalMarks + return groups to producedMarks } -private fun RuleConversionCtx.convertCompositionRightMatchingRule( +private fun RuleConversionCtx.convertCompositionLeftMatchingRule( strategy: TaintRuleStrategy, automata: SemgrepMatchingRule>, - initialVar: MetavarAtom, - leftFinalMarks: Set, -): List> { - val rightEdges = automata.flatMap { r -> - val automataWithVars = TaintRegisterStateAutomataWithStateVars( - r.rule, initialStateVars = setOf(initialVar), acceptStateVars = emptySet() - ) - val taintEdges = safeConvertToTaintRules { - generateTaintAutomataEdges(automataWithVars, r.metaVarInfo) - } - listOfNotNull(taintEdges) - } - - val rightCtx = rightEdges.rules.mapIndexed { idx, r -> - composeRuleJoinRight(r, initialVar, leftFinalMarks, strategy, idx) - } - - val rightRules = rightCtx.mapNotNull { - safeConvertToTaintRules { - val generatedRules = strategy.generateTaintRules(it, this, SinkDiscardMode.TRIVIAL_CONDITION) - TaintRuleFromSemgrep.TaintRuleGroup(generatedRules) - } - } - - return rightRules -} + finalVar: MetavarAtom, +): Pair>, Set> = + buildMatchingNode(strategy, automata, MatchingFlow.Source(finalVar)) -private fun RuleConversionCtx.composeRuleJoinRight( - r: TaintAutomataEdges, +private fun RuleConversionCtx.convertCompositionRightMatchingRule( + strategy: TaintRuleStrategy, + automata: SemgrepMatchingRule>, initialVar: MetavarAtom, leftFinalMarks: Set, - taintStrategy: TaintRuleStrategy, - idx: Int -): TaintRuleGenerationCtx { - val composition = JoinRightCompositionStrategy(r, initialVar, leftFinalMarks, taintStrategy) - return TaintRuleGenerationCtx(RuleUniqueMarkPrefix(ruleId, idx), r, composition, taintStrategy) -} +): List> = + buildMatchingNode(strategy, automata, MatchingFlow.Sink(initialVar, leftFinalMarks)).first private fun RuleConversionCtx.convertCompositionRightTaintRule( strategy: TaintRuleStrategy, automata: SemgrepTaintRule>, - @Suppress("UNUSED_PARAMETER") initialVar: MetavarAtom, - leftFinalMarks: Set, + sourceMarks: Set, ): List>? { if (automata.sources.isNotEmpty()) { trace.error(JoinOnTaintRuleWithNonEmptySources()) return null } - // note: we always treat initial var as taint source - return convertCompositionRightTaintRule(strategy, automata, leftFinalMarks) -} - -private fun RuleConversionCtx.convertCompositionRightTaintRule( - strategy: TaintRuleStrategy, - automata: SemgrepTaintRule>, - sourceMarks: Set, -): List> { - val preparedRules = prepareTaintNonSourceRules( - automata, + // The upstream marks are this sink's taint sources. + return convertTaintNode( + strategy, automata, sources = emptyList(), - taintMarks = sourceMarks.mapTo(hashSetOf()) { GeneratedTaintMark(it) } + taintMarks = sourceMarks.mapTo(hashSetOf()) { GeneratedTaintMark(it) }, + ignoreEmptySources = true, ) - return convertTaintRuleToTaintRules(strategy, preparedRules, ignoreEmptySources = true).taintRules } private fun RuleConversionCtx.convertCompositionLeftTaintRule( @@ -268,9 +260,7 @@ private fun RuleConversionCtx.convertCompositionLeft return null } - val finalLabels = automata.sources - .mapNotNull { it.label } - .filter { it.label == finalVar.name } + val finalLabels = automata.labelsMatching(finalVar.name) if (finalLabels.isEmpty()) { trace.error(JoinIsImpossibleNoLabelFound(finalVar.name)) @@ -286,15 +276,31 @@ private fun RuleConversionCtx.convertCompositionLeft finalLabels: List, ): Pair>, Set> { val (sources, taintMarks) = prepareTaintSourceRules(automata) - - val preparedRules = prepareTaintNonSourceRules( - automata, - sources = sources, - taintMarks = taintMarks - ) - - val result = convertTaintRuleToTaintRules(strategy, preparedRules, ignoreEmptySources = false) - + val groups = convertTaintNode(strategy, automata, sources, taintMarks, ignoreEmptySources = false) val finalMarks = finalLabels.mapTo(hashSetOf()) { taintMark(it) } - return result.taintRules to finalMarks + return groups to finalMarks } + +/** + * The taint-rule node spine: inject [taintMarks] (own source marks and/or upstream marks) as taint + * sources, then convert. Source and sink roles differ only in what they feed here and in how they + * derive their downstream marks. + */ +private fun RuleConversionCtx.convertTaintNode( + strategy: TaintRuleStrategy, + automata: SemgrepTaintRule>, + sources: List>>, + taintMarks: Set, + ignoreEmptySources: Boolean, +): List> = + convertTaintRuleToTaintRules( + strategy, + prepareTaintNonSourceRules(automata, sources = sources, taintMarks = taintMarks), + ignoreEmptySources, + ).taintRules + +/** Source labels of this taint rule whose name matches [varName] (the join metavar on this side). */ +private fun SemgrepTaintRule>.labelsMatching( + varName: String, +): List = + sources.mapNotNull { it.label }.filter { it.label == varName } diff --git a/core/opentaint-java-querylang/src/test/kotlin/org/opentaint/semgrep/ExampleTest.kt b/core/opentaint-java-querylang/src/test/kotlin/org/opentaint/semgrep/ExampleTest.kt index 9a874949f..dd4eee2cb 100644 --- a/core/opentaint-java-querylang/src/test/kotlin/org/opentaint/semgrep/ExampleTest.kt +++ b/core/opentaint-java-querylang/src/test/kotlin/org/opentaint/semgrep/ExampleTest.kt @@ -198,6 +198,9 @@ class ExampleTest : SampleBasedTest() { @Test fun `test join with taint and matching left`() = runTest() + @Test + fun `test join tag union`() = runTest() + @Test fun `test object mapper pattern-not full`() = runTest() diff --git a/core/opentaint-java-querylang/src/test/kotlin/org/opentaint/semgrep/pattern/RuleTagsAndJoinTest.kt b/core/opentaint-java-querylang/src/test/kotlin/org/opentaint/semgrep/pattern/RuleTagsAndJoinTest.kt new file mode 100644 index 000000000..73ff2b860 --- /dev/null +++ b/core/opentaint-java-querylang/src/test/kotlin/org/opentaint/semgrep/pattern/RuleTagsAndJoinTest.kt @@ -0,0 +1,400 @@ +package org.opentaint.semgrep.pattern + +import org.opentaint.semgrep.pattern.conversion.JavaLanguageStrategy +import kotlin.io.path.Path +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +class RuleTagsAndJoinTest { + + private fun parse(yml: String): SemgrepYamlRuleSet { + val trace = SemgrepFileLoadTrace("test.yaml") + return parseSemgrepYaml(yml, trace) ?: error("parse returned null; errors=${trace.errorMessages()}") + } + + @Test + fun `rule tags parse into a list`() { + val rs = parse( + """ + rules: + - id: tagged + severity: NOTE + message: m + languages: [java] + tags: + - untrusted-data-source + - ssrf-source + pattern: foo() + """.trimIndent() + ) + assertEquals(listOf("untrusted-data-source", "ssrf-source"), rs.rules.single().tags) + } + + @Test + fun `rule without tags defaults to empty`() { + val rs = parse( + """ + rules: + - id: untagged + severity: NOTE + message: m + languages: [java] + pattern: foo() + """.trimIndent() + ) + assertTrue(rs.rules.single().tags.isEmpty()) + } + + @Test + fun `join ref accepts either rule or tag`() { + val rs = parse( + """ + rules: + - id: j + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - tag: untrusted-data-source + as: untrusted-data + - rule: lib.yaml#sink + as: sink + on: + - 'untrusted-data.${'$'}X -> sink.${'$'}X' + """.trimIndent() + ) + val refs = rs.rules.single().join!!.refs + assertEquals("untrusted-data-source", refs[0].tag) + assertEquals(null, refs[0].rule) + assertEquals("lib.yaml#sink", refs[1].rule) + assertEquals(null, refs[1].tag) + } + + private fun load(vararg files: Pair): Pair { + val trace = SemgrepLoadTrace() + val loader = SemgrepRuleLoader(listOf(JavaLanguageStrategy())) + for ((path, text) in files) { + loader.registerRuleSet(text, Path(path), Path("."), trace) + } + return loader.loadRules() to trace + } + + private fun loadedRuleIds(r: SemgrepRuleLoader.RuleLoadResult): List = + r.rulesWithMeta.map { it.second.shortRuleId } + + private val sinkLib = "lib/sink.yaml" to """ + rules: + - id: ssrf-sink + options: { lib: true } + severity: NOTE + message: sink + languages: [java] + patterns: + - pattern: sink(${'$'}X) + """.trimIndent() + + private val servletSource = "lib/servlet.yaml" to """ + rules: + - id: servlet-source + options: { lib: true } + tags: [untrusted-data-source] + severity: NOTE + message: src + languages: [java] + patterns: + - pattern: servletSource() + """.trimIndent() + + @Test + fun `tag ref expanding to no rules is a hard error`() { + val (_, trace) = load( + sinkLib, + "ssrf.yaml" to """ + rules: + - id: ssrf + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - tag: untrusted-data-source + as: untrusted-data + - rule: lib/sink.yaml#ssrf-sink + as: sink + on: + - 'untrusted-data.${'$'}X -> sink.${'$'}X' + """.trimIndent() + ) + assertTrue(trace.errorMessages().any { it.contains("no rule declares that tag") }, trace.errorMessages().toString()) + } + + @Test + fun `ref with neither rule nor tag errors`() { + val (_, trace) = load( + sinkLib, + "j.yaml" to """ + rules: + - id: j + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - as: x + - rule: lib/sink.yaml#ssrf-sink + as: sink + on: + - 'x.${'$'}X -> sink.${'$'}X' + """.trimIndent() + ) + assertTrue(trace.errorMessages().any { it.contains("neither was given") }, trace.errorMessages().toString()) + } + + @Test + fun `ref with both rule and tag errors`() { + val (_, trace) = load( + sinkLib, servletSource, + "j.yaml" to """ + rules: + - id: j + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - rule: lib/servlet.yaml#servlet-source + tag: untrusted-data-source + as: x + - rule: lib/sink.yaml#ssrf-sink + as: sink + on: + - 'x.${'$'}X -> sink.${'$'}X' + """.trimIndent() + ) + assertTrue(trace.errorMessages().any { it.contains("both were given") }, trace.errorMessages().toString()) + } + + @Test + fun `two refs sharing one alias is rejected`() { + val (_, trace) = load( + sinkLib, servletSource, + "j.yaml" to """ + rules: + - id: j + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - rule: lib/servlet.yaml#servlet-source + as: dup + - rule: lib/sink.yaml#ssrf-sink + as: dup + on: + - 'dup.${'$'}X -> dup.${'$'}X' + """.trimIndent() + ) + assertTrue(trace.errorMessages().any { it.contains("declared by more than one ref") }, trace.errorMessages().toString()) + } + + @Test + fun `ref to a join rule errors`() { + val (_, trace) = load( + sinkLib, + "inner.yaml" to """ + rules: + - id: inner-join + options: { lib: true } + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - rule: lib/sink.yaml#ssrf-sink + as: s + on: + - 's.${'$'}X -> s.${'$'}X' + """.trimIndent(), + "j.yaml" to """ + rules: + - id: j + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - rule: lib/sink.yaml#ssrf-sink + as: src + - rule: inner.yaml#inner-join + as: sink + on: + - 'src.${'$'}X -> sink.${'$'}X' + """.trimIndent() + ) + assertTrue(trace.errorMessages().any { it.contains("is a join rule") }, trace.errorMessages().toString()) + } + + @Test + fun `conflicting alias metavars error`() { + val (_, trace) = load( + "lib/srcs.yaml" to """ + rules: + - id: a + options: { lib: true } + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: a(${'$'}X) } ] + - id: b + options: { lib: true } + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: b(${'$'}Y) } ] + """.trimIndent(), + sinkLib, + "j.yaml" to """ + rules: + - id: j + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - rule: lib/srcs.yaml#a + as: a + - rule: lib/srcs.yaml#b + as: b + - rule: lib/sink.yaml#ssrf-sink + as: sink + on: + - 'a.${'$'}X -> sink.${'$'}T' + - 'b.${'$'}Y -> sink.${'$'}U' + """.trimIndent() + ) + assertTrue(trace.errorMessages().any { it.contains("conflicting metavariables") }, trace.errorMessages().toString()) + } + + @Test + fun `multi-sink join fans a tagged source union into several sinks`() { + val (result, trace) = load( + "lib/cmd.yaml" to """ + rules: + - id: src-a + options: { lib: true } + tags: [demo-src] + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: ${'$'}V = srcA() } ] + - id: src-b + options: { lib: true } + tags: [demo-src] + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: ${'$'}V = srcB() } ] + - id: sink-x + options: { lib: true } + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: sinkX(${'$'}V) } ] + - id: sink-y + options: { lib: true } + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: sinkY(${'$'}V) } ] + """.trimIndent(), + "j.yaml" to """ + rules: + - id: multi-sink + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - tag: demo-src + as: src + - rule: lib/cmd.yaml#sink-x + as: sinkx + - rule: lib/cmd.yaml#sink-y + as: sinky + on: + - 'src.${'$'}V -> sinkx.${'$'}V' + - 'src.${'$'}V -> sinky.${'$'}V' + """.trimIndent() + ) + assertTrue(trace.errorMessages().isEmpty(), trace.errorMessages().toString()) + assertTrue("multi-sink" in loadedRuleIds(result), "join rule should load; loaded=${loadedRuleIds(result)}") + } + + @Test + fun `chaining an alias as both source and sink is rejected`() { + val (_, trace) = load( + "lib/three.yaml" to """ + rules: + - id: a + options: { lib: true } + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: a(${'$'}X) } ] + - id: b + options: { lib: true } + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: b(${'$'}X) } ] + - id: c + options: { lib: true } + severity: NOTE + message: m + languages: [java] + patterns: [ { pattern: c(${'$'}X) } ] + """.trimIndent(), + "j.yaml" to """ + rules: + - id: j + severity: ERROR + message: m + languages: [java] + mode: join + join: + refs: + - rule: lib/three.yaml#a + as: a + - rule: lib/three.yaml#b + as: b + - rule: lib/three.yaml#c + as: c + on: + - 'a.${'$'}X -> b.${'$'}X' + - 'b.${'$'}X -> c.${'$'}X' + """.trimIndent() + ) + assertTrue(trace.errorMessages().any { it.contains("chains an alias") }, trace.errorMessages().toString()) + } +} + +// Test-only helpers for reading the load trace. +fun SemgrepFileLoadTrace.errorMessages(): List = + entries.filterIsInstance().map { it.message } + + ruleTraces.flatMap { rt -> + rt.entries.filterIsInstance().map { it.message } + + rt.steps.flatMap { st -> st.entries.filterIsInstance().map { it.message } } + } + +fun SemgrepLoadTrace.errorMessages(): List = fileTraces.flatMap { it.errorMessages() }