From bd91b5a7f88f8971865afeab8b91444a93275c8a Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Mon, 10 Nov 2025 10:50:18 +0530 Subject: [PATCH 1/4] minor improvements --- cmd/alterx/main.go | 202 ++++++++- internal/dank/dank.go | 559 +++++++++++++++++++++++ internal/patternmining/clustering.go | 52 +++ internal/patternmining/patternmining.go | 573 ++++++++++++++++++++++++ internal/patternmining/regex.go | 355 +++++++++++++++ internal/runner/runner.go | 17 + 6 files changed, 1744 insertions(+), 14 deletions(-) create mode 100644 internal/dank/dank.go create mode 100644 internal/patternmining/clustering.go create mode 100644 internal/patternmining/patternmining.go create mode 100644 internal/patternmining/regex.go diff --git a/cmd/alterx/main.go b/cmd/alterx/main.go index e87b4c96..3bed8a97 100644 --- a/cmd/alterx/main.go +++ b/cmd/alterx/main.go @@ -1,10 +1,13 @@ package main import ( + "context" "io" "os" + "strings" "github.com/projectdiscovery/alterx" + "github.com/projectdiscovery/alterx/internal/patternmining" "github.com/projectdiscovery/alterx/internal/runner" "github.com/projectdiscovery/gologger" ) @@ -13,17 +16,83 @@ func main() { cliOpts := runner.ParseFlags() + // Validate mode + if cliOpts.Mode != "default" && cliOpts.Mode != "discover" && cliOpts.Mode != "both" { + gologger.Fatal().Msgf("invalid mode: %s (must be 'default', 'discover', or 'both')", cliOpts.Mode) + } + + // Handle pattern mining modes (discover or both) + var minedPatterns []string + if cliOpts.Mode == "discover" || cliOpts.Mode == "both" { + target := extractTargetDomain(cliOpts.Domains) + if target == "" { + gologger.Fatal().Msgf("pattern mining requires domains with a common target (e.g., sub.example.com)") + } + + gologger.Info().Msgf("Pattern mining mode enabled (Go port of Regulator by @cramppet)") + gologger.Info().Msgf("Target domain: %s", target) + + miner := patternmining.NewMiner(&patternmining.Options{ + Domains: cliOpts.Domains, + Target: target, + MinDistance: cliOpts.MinDistance, + MaxDistance: cliOpts.MaxDistance, + PatternThreshold: cliOpts.PatternThreshold, + QualityRatio: float64(cliOpts.QualityRatio), + MaxLength: 1000, + NgramsLimit: cliOpts.NgramsLimit, + }) + + result, err := miner.Mine() + if err != nil { + gologger.Fatal().Msgf("pattern mining failed: %v", err) + } + + // Save rules if requested + if cliOpts.SaveRules != "" { + if err := miner.SaveRules(result, cliOpts.SaveRules); err != nil { + gologger.Error().Msgf("failed to save rules: %v", err) + } else { + gologger.Info().Msgf("Saved %d patterns to %s", len(result.Patterns), cliOpts.SaveRules) + } + } + + // Generate subdomains from discovered patterns + if cliOpts.Mode == "discover" { + // In discover mode, only use mined patterns + generated := miner.GenerateFromPatterns(result.Patterns) + + // Write output + output := getOutputWriter(cliOpts.Output) + defer closeOutput(output, cliOpts.Output) + + for _, subdomain := range generated { + if cliOpts.Limit > 0 && len(generated) >= cliOpts.Limit { + break + } + output.Write([]byte(subdomain + "\n")) + } + + gologger.Info().Msgf("Generated %d subdomains from discovered patterns", len(generated)) + return + } + + // In 'both' mode, collect mined patterns for combination + minedPatterns = result.Patterns + gologger.Info().Msgf("Discovered %d patterns, combining with user-defined patterns", len(minedPatterns)) + } + + // Handle default mode or 'both' mode alterOpts := alterx.Options{ Domains: cliOpts.Domains, Patterns: cliOpts.Patterns, Payloads: cliOpts.Payloads, Limit: 
cliOpts.Limit, - Enrich: cliOpts.Enrich, // enrich payloads - MaxSize: cliOpts.MaxSize, + Enrich: cliOpts.Enrich, + MaxSize: cliOpts.MaxSize, } if cliOpts.PermutationConfig != "" { - // read config config, err := alterx.NewConfig(cliOpts.PermutationConfig) if err != nil { gologger.Fatal().Msgf("failed to read %v file got: %v", cliOpts.PermutationConfig, err) @@ -36,32 +105,137 @@ func main() { } } - // configure output writer - var output io.Writer - if cliOpts.Output != "" { - fs, err := os.OpenFile(cliOpts.Output, os.O_CREATE|os.O_WRONLY, 0644) + // In 'both' mode, add mined patterns to user patterns + if cliOpts.Mode == "both" && len(minedPatterns) > 0 { + // Convert mined patterns to alterx format + // Mined patterns are already in regex format, but alterx expects template format + // For now, we'll generate from mined patterns separately and combine results + target := extractTargetDomain(cliOpts.Domains) + miner := patternmining.NewMiner(&patternmining.Options{ + Domains: cliOpts.Domains, + Target: target, + MinDistance: cliOpts.MinDistance, + MaxDistance: cliOpts.MaxDistance, + PatternThreshold: cliOpts.PatternThreshold, + QualityRatio: float64(cliOpts.QualityRatio), + MaxLength: 1000, + NgramsLimit: cliOpts.NgramsLimit, + }) + + generated := miner.GenerateFromPatterns(minedPatterns) + + // Use a dedupe set for both modes + allResults := make(map[string]bool) + for _, g := range generated { + allResults[g] = true + } + + // Now run the normal alterx generation + output := getOutputWriter(cliOpts.Output) + defer closeOutput(output, cliOpts.Output) + + m, err := alterx.New(&alterOpts) if err != nil { - gologger.Fatal().Msgf("failed to open output file %v got %v", cliOpts.Output, err) + gologger.Fatal().Msgf("failed to parse alterx config got %v", err) } - output = fs - defer fs.Close() - } else { - output = os.Stdout + + if cliOpts.Estimate { + estimated := m.EstimateCount() + len(generated) + gologger.Info().Msgf("Estimated Payloads (including duplicates): %v", estimated) + return + } + + // First write mined results + count := 0 + for subdomain := range allResults { + if cliOpts.Limit > 0 && count >= cliOpts.Limit { + break + } + output.Write([]byte(subdomain + "\n")) + count++ + } + + // Then write alterx results (with deduplication) + if err = executeAlterxWithDedup(m, output, allResults, cliOpts.Limit-count); err != nil { + gologger.Error().Msgf("failed to write output to file got %v", err) + } + + gologger.Info().Msgf("Generated %d total unique subdomains (both modes)", len(allResults)) + return } - // create new alterx instance with options + // Standard default mode + output := getOutputWriter(cliOpts.Output) + defer closeOutput(output, cliOpts.Output) + m, err := alterx.New(&alterOpts) if err != nil { gologger.Fatal().Msgf("failed to parse alterx config got %v", err) } if cliOpts.Estimate { - gologger.Info().Msgf("Estimated Payloads (including duplicates) : %v", m.EstimateCount()) + gologger.Info().Msgf("Estimated Payloads (including duplicates): %v", m.EstimateCount()) return } if err = m.ExecuteWithWriter(output); err != nil { gologger.Error().Msgf("failed to write output to file got %v", err) } +} +// extractTargetDomain extracts the common target domain from input domains +func extractTargetDomain(domains []string) string { + if len(domains) == 0 { + return "" + } + + // Take the first domain and extract root domain + first := domains[0] + parts := strings.Split(first, ".") + if len(parts) >= 2 { + // Return last two parts as target domain (e.g., "example.com") + return 
strings.Join(parts[len(parts)-2:], ".") + } + return first +} + +// getOutputWriter returns the appropriate output writer +func getOutputWriter(outputPath string) io.Writer { + if outputPath != "" { + fs, err := os.OpenFile(outputPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + gologger.Fatal().Msgf("failed to open output file %v got %v", outputPath, err) + } + return fs + } + return os.Stdout +} + +// closeOutput closes the output writer if it's a file +func closeOutput(output io.Writer, outputPath string) { + if outputPath != "" { + if closer, ok := output.(io.Closer); ok { + closer.Close() + } + } +} + +// executeAlterxWithDedup executes alterx with deduplication against existing results +func executeAlterxWithDedup(m *alterx.Mutator, output io.Writer, existing map[string]bool, remainingLimit int) error { + // We need to capture alterx output and dedupe it + // Create a custom writer that dedupes + count := 0 + resChan := m.Execute(context.TODO()) + + for value := range resChan { + if remainingLimit > 0 && count >= remainingLimit { + continue + } + if !existing[value] && !strings.HasPrefix(value, "-") { + existing[value] = true + output.Write([]byte(value + "\n")) + count++ + } + } + return nil } diff --git a/internal/dank/dank.go b/internal/dank/dank.go new file mode 100644 index 00000000..0c4b776e --- /dev/null +++ b/internal/dank/dank.go @@ -0,0 +1,559 @@ +package dank + +import ( + "fmt" + "math/big" + "regexp" + "sort" + "strconv" + "strings" +) + +// DankEncoder implementation matching Python's C++ backend exactly +// Uses Brzozowski's algorithm for DFA minimization + +// preprocessRegex expands character classes like [1-5] to (1|2|3|4|5) +func preprocessRegex(regex string) string { + // Find all character classes of form [x-y] + re := regexp.MustCompile(`\[(.)\-(.)\]`) + + for { + match := re.FindStringSubmatchIndex(regex) + if match == nil { + break + } + + // Extract start and end characters + startChar := regex[match[2]] + endChar := regex[match[4]] + + // Expand range + var elements []string + for ch := startChar; ch <= endChar; ch++ { + elements = append(elements, string(ch)) + } + expanded := "(" + strings.Join(elements, "|") + ")" + + // Replace in regex + fullMatch := regex[match[0]:match[1]] + regex = strings.Replace(regex, fullMatch, expanded, 1) + } + + return regex +} + +// NFAState represents a state in the Thompson NFA (C++ style). +type NFAState struct { + ID int + Trans map[byte]map[int]bool // byte -> set of state IDs + IsFinal bool +} + +// DFAState represents a DFA state (subset of NFA states). +type DFAState struct { + ID int + NFAIDs []int // sorted for key + Trans map[byte]int + IsFinal bool +} + +// DankEncoder is the main struct matching Python's C++ backend. +type DankEncoder struct { + regex string + alphabet []byte + nfa []*NFAState + initStates map[int]bool + dfaStates map[string]int + dfa []*DFAState + fixedSlice int + stateCounter int +} + +// NewDankEncoder initializes and builds the automaton using C++ algorithm. 
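+// A minimal usage sketch (hypothetical inputs; NumWords and
+// GenerateAtFixedLength are defined later in this file):
+//
+//	enc := NewDankEncoder("(api|dev)(0[1-9])?", 16)
+//	total := enc.NumWords(1, 16)          // count accepted strings of length 1..16
+//	words := enc.GenerateAtFixedLength(5) // enumerate accepted strings of exactly length 5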
+func NewDankEncoder(regexStr string, fixedSlice int) *DankEncoder { + dnsAlphabet := []byte("abcdefghijklmnopqrstuvwxyz0123456789._-") + + // Preprocess regex to expand character classes (like Python's preprocess) + preprocessed := preprocessRegex(regexStr) + + d := &DankEncoder{ + regex: preprocessed, + alphabet: dnsAlphabet, + fixedSlice: fixedSlice, + stateCounter: 0, + } + + // Build NFA using C++ algorithm + d.buildNFAFromRegex(preprocessed) + + // Build DFA using Brzozowski's algorithm (like Python's C++) + // determinize -> reverse -> determinize -> reverse -> determinize + d.buildDFA() + d.reverseDFA() + d.buildDFA() + d.reverseDFA() + d.buildDFA() + + return d +} + +// buildNFAFromRegex constructs NFA matching C++ from_regex +func (d *DankEncoder) buildNFAFromRegex(regex string) { + // Initialize: create states 0 and 1, state 1 is final + d.nfa = []*NFAState{ + {ID: 0, Trans: make(map[byte]map[int]bool)}, + {ID: 1, Trans: make(map[byte]map[int]bool), IsFinal: true}, + } + d.stateCounter = 2 + + // Build from start=0 to end=1 + d.fromRegex(0, 1, regex) + + // Get epsilon closure of initial state + d.initStates = make(map[int]bool) + d.initStates[0] = true + d.epsilonClosure(d.initStates) +} + +// fromRegex implements _from_regex from C++ +func (d *DankEncoder) fromRegex(s, t int, pattern string) { + if len(pattern) == 0 { + d.insertNFA(s, 0, t) // epsilon transition + return + } + + // Single character (including escaped) + if len(pattern) == 1 { + d.insertNFA(s, pattern[0], t) + return + } + + // Escaped character + if len(pattern) == 2 && pattern[0] == '\\' { + d.insertNFA(s, pattern[1], t) + return + } + + // Find rightmost top-level | or concatenation point + optionPos := -1 + concatPos := -1 + depth := 0 + + for i := 0; i < len(pattern); i++ { + ch := pattern[i] + + switch ch { + case '\\': + if depth == 0 { + concatPos = i + } + i++ // Skip next char + case '(': + if depth == 0 { + concatPos = i + } + depth++ + case ')': + depth-- + case '|': + if depth == 0 { + optionPos = i + } + case '?', '*', '+': + // Don't update concat for operators + default: + if depth == 0 { + concatPos = i + } + } + } + + // Handle alternation (|) + if optionPos >= 0 { + // Create intermediate states + i0 := d.newState() + i1 := d.newState() + d.insertNFA(s, 0, i0) // epsilon + d.insertNFA(i1, 0, t) // epsilon + d.fromRegex(i0, i1, pattern[:optionPos]) + + i0 = d.newState() + i1 = d.newState() + d.insertNFA(s, 0, i0) // epsilon + d.insertNFA(i1, 0, t) // epsilon + d.fromRegex(i0, i1, pattern[optionPos+1:]) + return + } + + // Handle concatenation + if concatPos > 0 { + i0 := d.newState() + i1 := d.newState() + d.insertNFA(i0, 0, i1) // epsilon + d.fromRegex(s, i0, pattern[:concatPos]) + d.fromRegex(i1, t, pattern[concatPos:]) + return + } + + // Handle postfix operators + lastChar := pattern[len(pattern)-1] + + if lastChar == '?' 
{ + i0 := d.newState() + i1 := d.newState() + d.insertNFA(s, 0, i0) // epsilon + d.insertNFA(s, 0, t) // epsilon (skip) + d.insertNFA(i1, 0, t) // epsilon + d.fromRegex(i0, i1, pattern[:len(pattern)-1]) + return + } + + if lastChar == '*' { + i0 := d.newState() + i1 := d.newState() + d.insertNFA(s, 0, i0) // epsilon + d.insertNFA(s, 0, t) // epsilon (skip) + d.insertNFA(i1, 0, i0) // epsilon (loop) + d.insertNFA(i1, 0, t) // epsilon (exit) + d.fromRegex(i0, i1, pattern[:len(pattern)-1]) + return + } + + if lastChar == '+' { + i0 := d.newState() + i1 := d.newState() + d.insertNFA(i0, 0, i1) // epsilon + d.fromRegex(s, i0, pattern[:len(pattern)-1]) + + s = i1 + i0 = d.newState() + i1 = d.newState() + d.insertNFA(s, 0, i0) // epsilon + d.insertNFA(s, 0, t) // epsilon (skip) + d.insertNFA(i1, 0, i0) // epsilon (loop) + d.insertNFA(i1, 0, t) // epsilon (exit) + d.fromRegex(i0, i1, pattern[:len(pattern)-1]) + return + } + + // Must be wrapped in parentheses + if pattern[0] == '(' && pattern[len(pattern)-1] == ')' { + d.fromRegex(s, t, pattern[1:len(pattern)-1]) + return + } + + // Shouldn't reach here + panic(fmt.Sprintf("Unexpected pattern: %s", pattern)) +} + +// newState creates a new NFA state +func (d *DankEncoder) newState() int { + id := d.stateCounter + d.stateCounter++ + d.nfa = append(d.nfa, &NFAState{ + ID: id, + Trans: make(map[byte]map[int]bool), + }) + return id +} + +// insertNFA adds a transition +func (d *DankEncoder) insertNFA(from int, ch byte, to int) { + if d.nfa[from].Trans[ch] == nil { + d.nfa[from].Trans[ch] = make(map[int]bool) + } + d.nfa[from].Trans[ch][to] = true +} + +// epsilonClosure computes epsilon closure +func (d *DankEncoder) epsilonClosure(states map[int]bool) { + queue := []int{} + for s := range states { + queue = append(queue, s) + } + + for len(queue) > 0 { + state := queue[0] + queue = queue[1:] + + // Get epsilon transitions (ch = 0) + if epsTargets, ok := d.nfa[state].Trans[0]; ok { + for target := range epsTargets { + if !states[target] { + states[target] = true + queue = append(queue, target) + } + } + } + } +} + +// buildDFA performs subset construction +func (d *DankEncoder) buildDFA() { + d.dfaStates = make(map[string]int) + startKey := d.setKey(d.initStates) + d.dfaStates[startKey] = 0 + + initList := []int{} + for s := range d.initStates { + initList = append(initList, s) + } + sort.Ints(initList) + + d.dfa = []*DFAState{ + { + ID: 0, + NFAIDs: initList, + Trans: make(map[byte]int), + IsFinal: d.hasAcceptingState(d.initStates), + }, + } + + queue := []int{0} + + for len(queue) > 0 { + currID := queue[0] + queue = queue[1:] + curr := d.dfa[currID] + + currStates := make(map[int]bool) + for _, nid := range curr.NFAIDs { + currStates[nid] = true + } + + // Group transitions by character (skip epsilon = 0) + charMap := make(map[byte]map[int]bool) + for nfaID := range currStates { + for ch, targets := range d.nfa[nfaID].Trans { + if ch == 0 { + continue // Skip epsilon + } + if charMap[ch] == nil { + charMap[ch] = make(map[int]bool) + } + for target := range targets { + charMap[ch][target] = true + } + } + } + + // Process each character + for ch, moveSet := range charMap { + // Compute epsilon closure + d.epsilonClosure(moveSet) + + key := d.setKey(moveSet) + nextID, exists := d.dfaStates[key] + + if !exists { + nextID = len(d.dfa) + d.dfaStates[key] = nextID + + moveList := []int{} + for s := range moveSet { + moveList = append(moveList, s) + } + sort.Ints(moveList) + + newState := &DFAState{ + ID: nextID, + NFAIDs: moveList, + Trans: 
make(map[byte]int), + IsFinal: d.hasAcceptingState(moveSet), + } + d.dfa = append(d.dfa, newState) + queue = append(queue, nextID) + } + + curr.Trans[ch] = nextID + } + } + + // Add dead state (like Python's C++ implementation) + // The dead state is a non-final state with all transitions to itself + deadStateID := len(d.dfa) + deadState := &DFAState{ + ID: deadStateID, + NFAIDs: []int{}, // No NFA states + Trans: make(map[byte]int), + IsFinal: false, + } + // All missing transitions in other states should point to dead state + // And dead state transitions to itself + for _, ch := range d.alphabet { + deadState.Trans[ch] = deadStateID + } + d.dfa = append(d.dfa, deadState) + + // Update all states to have complete transition functions pointing to dead state + for _, state := range d.dfa[:deadStateID] { // Don't process dead state itself + for _, ch := range d.alphabet { + if _, exists := state.Trans[ch]; !exists { + state.Trans[ch] = deadStateID + } + } + } +} + +// hasAcceptingState checks if any NFA state in set is final +func (d *DankEncoder) hasAcceptingState(states map[int]bool) bool { + for s := range states { + if s < len(d.nfa) && d.nfa[s].IsFinal { + return true + } + } + return false +} + +// setKey creates a unique key for a set of states +func (d *DankEncoder) setKey(states map[int]bool) string { + ids := []int{} + for s := range states { + ids = append(ids, s) + } + sort.Ints(ids) + + strs := make([]string, len(ids)) + for i, id := range ids { + strs[i] = strconv.Itoa(id) + } + return strings.Join(strs, ",") +} + +// NumWords counts accepted strings using DP +func (d *DankEncoder) NumWords(minLen, maxLen int) int64 { + if maxLen > d.fixedSlice { + maxLen = d.fixedSlice + } + + var total big.Int + dp := make([]map[int]*big.Int, maxLen+1) + for i := range dp { + dp[i] = make(map[int]*big.Int) + } + dp[0][0] = big.NewInt(1) + + // Don't count dead state in DP + deadState := len(d.dfa) - 1 + + for l := 1; l <= maxLen; l++ { + for state, ways := range dp[l-1] { + // Iterate over actual transitions, not just alphabet + // (pattern may contain characters outside alphabet like *) + for _, next := range d.dfa[state].Trans { + // Skip transitions to dead state + if next == deadState { + continue + } + if _, has := dp[l][next]; !has { + dp[l][next] = big.NewInt(0) + } + dp[l][next].Add(dp[l][next], ways) + } + } + } + + for l := minLen; l <= maxLen; l++ { + for state, ways := range dp[l] { + if d.dfa[state].IsFinal { + total.Add(&total, ways) + } + } + } + + return total.Int64() +} + +// GenerateAtFixedLength returns all strings of exactly fixedLen +func (d *DankEncoder) GenerateAtFixedLength(fixedLen int) []string { + var results []string + d.dfsGenerateFixed(0, "", fixedLen, &results) + sort.Strings(results) + return results +} + +// dfsGenerateFixed generates only strings of exact length +func (d *DankEncoder) dfsGenerateFixed(state int, curr string, remaining int, results *[]string) { + // Skip dead state (last state in DFA) + deadState := len(d.dfa) - 1 + if state == deadState { + return + } + + if remaining == 0 { + if d.dfa[state].IsFinal { + *results = append(*results, curr) + } + return + } + + // Iterate over actual transitions (sorted for deterministic output) + // Can't just use alphabet because pattern may have characters outside alphabet (like *) + chars := []byte{} + for ch := range d.dfa[state].Trans { + chars = append(chars, ch) + } + sort.Slice(chars, func(i, j int) bool { return chars[i] < chars[j] }) + + for _, ch := range chars { + next := d.dfa[state].Trans[ch] + 
// Don't transition to dead state during generation + if next != deadState { + d.dfsGenerateFixed(next, curr+string(ch), remaining-1, results) + } + } +} + +// NumStates returns the number of DFA states +func (d *DankEncoder) NumStates() int { + return len(d.dfa) +} + +// NumNFAStates returns the number of NFA states (for debugging) +func (d *DankEncoder) NumNFAStates() int { + return len(d.nfa) +} + +// reverseDFA converts the current DFA back to an NFA with reversed transitions +// This is part of Brzozowski's algorithm for DFA minimization +func (d *DankEncoder) reverseDFA() { + // Create new NFA with same number of states as DFA + newNFA := make([]*NFAState, len(d.dfa)) + for i := range newNFA { + newNFA[i] = &NFAState{ + ID: i, + Trans: make(map[byte]map[int]bool), + IsFinal: false, + } + } + + // The old init state becomes the only final state + newNFA[0].IsFinal = true + + // Reverse all transitions + for _, state := range d.dfa { + for ch, target := range state.Trans { + // Add reverse transition: target --ch--> state.ID + if newNFA[target].Trans[ch] == nil { + newNFA[target].Trans[ch] = make(map[int]bool) + } + newNFA[target].Trans[ch][state.ID] = true + } + } + + // Old final states become new init states + newInitStates := make(map[int]bool) + for _, state := range d.dfa { + if state.IsFinal { + newInitStates[state.ID] = true + } + } + + // Replace NFA + d.nfa = newNFA + d.initStates = newInitStates + d.stateCounter = len(newNFA) + + // Compute epsilon closure of init states (no epsilon transitions after reverse, but for consistency) + d.epsilonClosure(d.initStates) +} diff --git a/internal/patternmining/clustering.go b/internal/patternmining/clustering.go new file mode 100644 index 00000000..66359123 --- /dev/null +++ b/internal/patternmining/clustering.go @@ -0,0 +1,52 @@ +package patternmining + +import "sort" + +// editClosures computes edit-distance closures for clustering +func (m *Miner) editClosures(items []string, delta int) [][]string { + var ret [][]string + for _, a := range items { + rSet := make(map[string]bool) + rSet[a] = true + for _, b := range items { + d := m.getDist(a, b) + if d < delta { + rSet[b] = true + } + } + r := []string{} + for k := range rSet { + r = append(r, k) + } + found := false + for _, s := range ret { + if sameSlices(r, s) { + found = true + break + } + } + if !found { + ret = append(ret, r) + } + } + return ret +} + +func sameSlices(a, b []string) bool { + if len(a) != len(b) { + return false + } + sa := make([]string, len(a)) + copy(sa, a) + sb := make([]string, len(b)) + copy(sb, b) + sort.Strings(sa) + sort.Strings(sb) + for i := range sa { + if sa[i] != sb[i] { + return false + } + } + return true +} + diff --git a/internal/patternmining/patternmining.go b/internal/patternmining/patternmining.go new file mode 100644 index 00000000..b35a47cc --- /dev/null +++ b/internal/patternmining/patternmining.go @@ -0,0 +1,573 @@ +package patternmining + +// Pattern Mining for Subdomain Discovery +// This is a Go port of Regulator by @cramppet (https://github.com/cramppet/regulator) +// Regulator uses edit-distance clustering and regex generalization to discover +// subdomain patterns from observed data. 
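+//
+// As a hypothetical illustration: given observations such as api-01.example.com,
+// api-02.example.com and dev-01.example.com, mining can yield a regex-style
+// pattern along the lines of (api|dev)(-0[1-2])?.example.com, from which unseen
+// candidates such as dev-02.example.com are generated.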
+ +import ( + "encoding/json" + "fmt" + "os" + "regexp" + "sort" + "strings" + + "github.com/projectdiscovery/alterx/internal/dank" + "github.com/projectdiscovery/gologger" +) + +var ( + reNum = regexp.MustCompile(`[0-9]+`) + reNumeric = regexp.MustCompile(`^[0-9]+$`) + reDoubleDot = regexp.MustCompile(`\.{2,}`) +) + +// Options contains pattern mining configuration +type Options struct { + // Input domains to analyze + Domains []string + // Target domain (e.g., "example.com") + Target string + // MinDistance is the minimum levenshtein distance for clustering + MinDistance int + // MaxDistance is the maximum levenshtein distance for clustering + MaxDistance int + // PatternThreshold is the threshold for pattern quality filtering + PatternThreshold int + // QualityRatio is the maximum ratio of synthetic/observed for pattern validation + QualityRatio float64 + // MaxLength is the maximum pattern length + MaxLength int + // NgramsLimit limits the number of n-grams to process (0 = no limit) + NgramsLimit int +} + +// Result contains discovered patterns and metadata +type Result struct { + Patterns []string + Metadata map[string]map[string]interface{} +} + +// PatternMetadata contains metadata about a discovered pattern +type PatternMetadata struct { + Mode string `json:"mode"` + K *int `json:"k,omitempty"` + Ngram string `json:"ngram,omitempty"` + Prefix string `json:"prefix,omitempty"` + ClusterSize int `json:"cluster_size"` + Nwords int `json:"nwords"` + Ratio float64 `json:"ratio"` + Members []string `json:"members,omitempty"` +} + +// RuleEntry represents a pattern with its metadata +type RuleEntry struct { + Pattern string `json:"pattern"` + Meta *PatternMetadata `json:"meta"` +} + +// StepGroup represents a mining step with its discovered patterns +type StepGroup struct { + Step string `json:"step"` + Entries []*RuleEntry `json:"entries"` +} + +// RulesOutput is the top-level structure for saved rules +type RulesOutput struct { + Steps []*StepGroup `json:"steps"` +} + +// Miner performs pattern mining on subdomain data +type Miner struct { + opts *Options + memo map[string]int // memoized levenshtein distances +} + +// NewMiner creates a new pattern miner +func NewMiner(opts *Options) *Miner { + return &Miner{ + opts: opts, + memo: make(map[string]int), + } +} + +// Mine discovers patterns from input domains +func (m *Miner) Mine() (*Result, error) { + if len(m.opts.Domains) == 0 { + return nil, fmt.Errorf("no domains provided for pattern mining") + } + if m.opts.Target == "" { + return nil, fmt.Errorf("target domain not specified") + } + + gologger.Info().Msgf("Starting pattern mining on %d observations", len(m.opts.Domains)) + + // Validate and filter domains + knownHosts := m.validateDomains() + if len(knownHosts) == 0 { + return nil, fmt.Errorf("no valid domains after filtering") + } + + gologger.Verbose().Msgf("Building pairwise distance table...") + m.buildDistanceTable(knownHosts) + + newRules := make(map[string]map[string]interface{}) + + // Phase 1: No enforced prefix - edit distance clustering + gologger.Verbose().Msgf("Phase 1: Edit distance clustering...") + for k := m.opts.MinDistance; k < m.opts.MaxDistance; k++ { + closures := m.editClosures(knownHosts, k) + for _, closure := range closures { + if len(closure) <= 1 { + continue + } + pattern, _ := m.closureToRegex(false, closure) + if len(pattern) > m.opts.MaxLength { + continue + } + subdomainPattern := strings.TrimSuffix(pattern, "."+m.opts.Target) + if m.isGoodRule(subdomainPattern, len(closure)) { + nwords := 
dank.NewDankEncoder(m.preparePattern(subdomainPattern), 256).NumWords(1, 256) + ratio := float64(0) + if len(closure) > 0 { + ratio = float64(nwords) / float64(len(closure)) + } + if _, exists := newRules[pattern]; !exists { + newRules[pattern] = map[string]interface{}{ + "mode": "no_prefix", + "k": k, + "cluster_size": len(closure), + "nwords": nwords, + "ratio": ratio, + "members": closure, + } + } + } + } + } + + // Phase 2: N-gram prefix clustering + gologger.Verbose().Msgf("Phase 2: N-gram prefix clustering...") + ngrams := m.generateNgrams(m.opts.NgramsLimit) + + for _, ngram := range ngrams { + keys := m.prefixKeys(knownHosts, ngram) + if len(keys) == 0 { + continue + } + + // Try ngram as simple prefix + rUn, _ := m.closureToRegex(false, keys) + rEsc, _ := m.closureToRegex(true, keys) + if m.isGoodRule(rEsc, len(keys)) { + nwords := dank.NewDankEncoder(m.preparePattern(rEsc), 256).NumWords(1, 256) + ratio := float64(0) + if len(keys) > 0 { + ratio = float64(nwords) / float64(len(keys)) + } + if _, exists := newRules[rUn]; !exists { + newRules[rUn] = map[string]interface{}{ + "mode": "ngram", + "ngram": ngram, + "cluster_size": len(keys), + "nwords": nwords, + "ratio": ratio, + "members": keys, + } + } + } + + // Try with first token as prefix + prefixes := m.extractFirstTokens(keys) + last := "" + for _, prefix := range prefixes { + keys2 := m.prefixKeys(knownHosts, prefix) + rUn, _ := m.closureToRegex(false, keys2) + rEsc, _ := m.closureToRegex(true, keys2) + + if m.isGoodRule(rEsc, len(keys2)) { + // Avoid redundant prefixes + if last == "" || !strings.HasPrefix(prefix, last) { + last = prefix + } else { + continue + } + + nwords := dank.NewDankEncoder(m.preparePattern(rEsc), 256).NumWords(1, 256) + ratio := float64(0) + if len(keys2) > 0 { + ratio = float64(nwords) / float64(len(keys2)) + } + if _, exists := newRules[rUn]; !exists { + newRules[rUn] = map[string]interface{}{ + "mode": "ngram_prefix", + "ngram": ngram, + "prefix": prefix, + "cluster_size": len(keys2), + "nwords": nwords, + "ratio": ratio, + "members": keys2, + } + } + } + + // Apply edit distance clustering within prefix group + if len(prefix) > 1 { + for kk := m.opts.MinDistance; kk < m.opts.MaxDistance; kk++ { + closures := m.editClosures(keys2, kk) + for _, closure := range closures { + rUn, _ := m.closureToRegex(false, closure) + rEsc, _ := m.closureToRegex(true, closure) + + if m.isGoodRule(rEsc, len(closure)) { + nwords := dank.NewDankEncoder(m.preparePattern(rEsc), 256).NumWords(1, 256) + ratio := float64(0) + if len(closure) > 0 { + ratio = float64(nwords) / float64(len(closure)) + } + if _, exists := newRules[rUn]; !exists { + newRules[rUn] = map[string]interface{}{ + "mode": "ngram_prefix", + "ngram": ngram, + "prefix": prefix, + "k": kk, + "cluster_size": len(closure), + "nwords": nwords, + "ratio": ratio, + "members": closure, + } + } + } + } + } + } + } + } + + patterns := make([]string, 0, len(newRules)) + for pattern := range newRules { + patterns = append(patterns, pattern) + } + sort.Strings(patterns) + + gologger.Info().Msgf("Discovered %d unique patterns", len(patterns)) + + return &Result{ + Patterns: patterns, + Metadata: newRules, + }, nil +} + +// validateDomains filters and validates input domains +func (m *Miner) validateDomains() []string { + var knownHosts []string + for _, host := range m.opts.Domains { + host = strings.TrimSpace(host) + if host == "" || host == m.opts.Target { + continue + } + if !strings.HasSuffix(host, "."+m.opts.Target) { + gologger.Verbose().Msgf("Rejecting 
malformed input: %s", host) + continue + } + // Validate tokenization + tokens := m.tokenize([]string{host}) + if len(tokens) == 0 || len(tokens[0]) == 0 || len(tokens[0][0]) == 0 { + gologger.Verbose().Msgf("Rejecting malformed input: %s", host) + continue + } + knownHosts = append(knownHosts, host) + } + return m.removeDuplicatesAndSort(knownHosts) +} + +// buildDistanceTable computes all pairwise levenshtein distances +func (m *Miner) buildDistanceTable(hosts []string) { + for i := 0; i < len(hosts); i++ { + for j := i; j < len(hosts); j++ { + d := levenshtein(hosts[i], hosts[j]) + key := getKey(hosts[i], hosts[j]) + m.memo[key] = d + } + } +} + +// generateNgrams creates unigrams and bigrams +func (m *Miner) generateNgrams(limit int) []string { + dnsChars := "abcdefghijklmnopqrstuvwxyz0123456789._-" + ngrams := []string{} + + // Unigrams + for _, c := range dnsChars { + ngrams = append(ngrams, string(c)) + } + + // Bigrams + for _, c1 := range dnsChars { + for _, c2 := range dnsChars { + ngrams = append(ngrams, string(c1)+string(c2)) + } + } + + sort.Strings(ngrams) + + if limit > 0 && len(ngrams) > limit { + return ngrams[:limit] + } + return ngrams +} + +// SaveRules writes discovered patterns and metadata to a single JSON file +func (m *Miner) SaveRules(result *Result, filename string) error { + f, err := os.Create(filename) + if err != nil { + return err + } + defer f.Close() + + // Group patterns by step with metadata + grouped := m.groupRulesByStep(result.Metadata) + + // Write as compact JSON + enc := json.NewEncoder(f) + return enc.Encode(grouped) +} + +// GenerateFromPatterns generates subdomains from discovered patterns +func (m *Miner) GenerateFromPatterns(patterns []string) []string { + var results []string + seen := make(map[string]bool) + + for _, pattern := range patterns { + subdomainPattern := strings.TrimSuffix(pattern, "."+m.opts.Target) + if len(subdomainPattern) == 0 { + continue + } + + // Calculate fixed length for generation + tempEncoder := dank.NewDankEncoder(m.preparePattern(subdomainPattern), 1) + fixedSlice := tempEncoder.NumStates() - 2 + if fixedSlice < 0 { + fixedSlice = 0 + } + + encoder := dank.NewDankEncoder(m.preparePattern(subdomainPattern), fixedSlice) + generated := encoder.GenerateAtFixedLength(fixedSlice) + + for _, item := range generated { + fullHost := item + "." 
+ m.opts.Target + // Remove double dots + fullHost = reDoubleDot.ReplaceAllString(fullHost, ".") + if !seen[fullHost] && fullHost != "" { + seen[fullHost] = true + results = append(results, fullHost) + } + } + } + + sort.Strings(results) + return results +} + +// Helper functions + +func (m *Miner) isGoodRule(regex string, nkeys int) bool { + encoder := dank.NewDankEncoder(m.preparePattern(regex), 256) + nwords := encoder.NumWords(1, 256) + if nwords < int64(m.opts.PatternThreshold) { + return true + } + if nkeys == 0 { + return false + } + return float64(nwords)/float64(nkeys) < m.opts.QualityRatio +} + +func (m *Miner) preparePattern(p string) string { + return escapeForDankEncoder(p) +} + +func (m *Miner) removeDuplicatesAndSort(hosts []string) []string { + seen := make(map[string]bool) + for _, h := range hosts { + seen[h] = true + } + res := make([]string, 0, len(seen)) + for k := range seen { + res = append(res, k) + } + sort.Strings(res) + return res +} + +func (m *Miner) prefixKeys(hosts []string, pre string) []string { + var res []string + for _, h := range hosts { + if strings.HasPrefix(h, pre) { + res = append(res, h) + } + } + return res +} + +func (m *Miner) extractFirstTokens(keys []string) []string { + firstTokens := make(map[string]bool) + for _, k := range keys { + ft := m.firstToken(k) + if ft != "" { + firstTokens[ft] = true + } + } + var prefixes []string + for ft := range firstTokens { + prefixes = append(prefixes, ft) + } + sort.Strings(prefixes) + return prefixes +} + +func (m *Miner) firstToken(host string) string { + tokens := m.tokenize([]string{host}) + if len(tokens) == 0 || len(tokens[0]) == 0 || len(tokens[0][0]) == 0 { + return "" + } + return tokens[0][0][0] +} + +func (m *Miner) groupRulesByStep(rules map[string]map[string]interface{}) *RulesOutput { + groups := map[string][]*RuleEntry{ + "no_prefix": {}, + "ngram": {}, + "ngram_prefix": {}, + } + stepNames := []string{"no_prefix", "ngram", "ngram_prefix"} + + for pattern, meta := range rules { + mode, ok := meta["mode"].(string) + if !ok { + continue + } + + // Build typed metadata struct + patternMeta := &PatternMetadata{ + Mode: mode, + } + + // Extract optional fields with type assertions + if k, ok := meta["k"].(int); ok { + patternMeta.K = &k + } + if ngram, ok := meta["ngram"].(string); ok { + patternMeta.Ngram = ngram + } + if prefix, ok := meta["prefix"].(string); ok { + patternMeta.Prefix = prefix + } + if clusterSize, ok := meta["cluster_size"].(int); ok { + patternMeta.ClusterSize = clusterSize + } + if nwords, ok := meta["nwords"].(int); ok { + patternMeta.Nwords = nwords + } + if ratio, ok := meta["ratio"].(float64); ok { + patternMeta.Ratio = ratio + } + // Note: members field is intentionally excluded from output + + entry := &RuleEntry{ + Pattern: pattern, + Meta: patternMeta, + } + groups[mode] = append(groups[mode], entry) + } + + // Build ordered steps + steps := make([]*StepGroup, 0, len(stepNames)) + for _, step := range stepNames { + entries := groups[step] + sort.Slice(entries, func(a, b int) bool { + return entries[a].Pattern < entries[b].Pattern + }) + stepGroup := &StepGroup{ + Step: step, + Entries: entries, + } + steps = append(steps, stepGroup) + } + + return &RulesOutput{ + Steps: steps, + } +} + +// getDist retrieves memoized distance +func (m *Miner) getDist(a, b string) int { + key := getKey(a, b) + if d, ok := m.memo[key]; ok { + return d + } + return 999999 +} + +func getKey(a, b string) string { + if strings.Compare(a, b) < 0 { + return a + "\x00" + b + } + return b + 
"\x00" + a +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} + +func levenshtein(s1, s2 string) int { + if len(s1) == 0 { + return len(s2) + } + if len(s2) == 0 { + return len(s1) + } + m := make([]int, len(s2)+1) + for i := range m { + m[i] = i + } + for i := 1; i <= len(s1); i++ { + curr := make([]int, len(s2)+1) + curr[0] = i + for j := 1; j <= len(s2); j++ { + cost := 0 + if s1[i-1] != s2[j-1] { + cost = 1 + } + curr[j] = min(curr[j-1]+1, min(m[j]+1, m[j-1]+cost)) + } + m = curr + } + return m[len(s2)] +} + +func escapeForDankEncoder(pattern string) string { + var result strings.Builder + prevWasOp := true + + for _, c := range pattern { + if c == '(' || c == '|' { + result.WriteRune(c) + prevWasOp = true + } else if c == ')' { + result.WriteRune(c) + prevWasOp = false + } else if c == '*' && prevWasOp { + result.WriteString("\\*") + prevWasOp = false + } else { + result.WriteRune(c) + prevWasOp = false + } + } + return result.String() +} diff --git a/internal/patternmining/regex.go b/internal/patternmining/regex.go new file mode 100644 index 00000000..12af0872 --- /dev/null +++ b/internal/patternmining/regex.go @@ -0,0 +1,355 @@ +package patternmining + +import ( + "fmt" + "sort" + "strconv" + "strings" +) + +// tokenize breaks down hostnames into structured tokens +func (m *Miner) tokenize(items []string) [][][]string { + var ret [][][]string + for _, item := range items { + subdomain := strings.TrimSuffix(item, "."+m.opts.Target) + labelsStr := strings.Split(subdomain, ".") + var n [][]string + for _, labelStr := range labelsStr { + var t []string + parts := strings.Split(labelStr, "-") + var tokens []string + for i, p := range parts { + if i == 0 { + tokens = append(tokens, p) + } else { + tokens = append(tokens, "-"+p) + } + } + for _, token := range tokens { + subt := splitNum(token) + var tt []string + for ii := 0; ii < len(subt); ii++ { + if subt[ii] == "-" && ii+1 < len(subt) { + subt[ii+1] = "-" + subt[ii+1] + } else { + tt = append(tt, subt[ii]) + } + } + t = append(t, tt...) 
+ } + n = append(n, t) + } + ret = append(ret, n) + } + return ret +} + +// splitNum splits strings by numeric sequences +func splitNum(s string) []string { + loc := reNum.FindAllStringIndex(s, -1) + var res []string + start := 0 + for _, l := range loc { + if l[0] > start { + res = append(res, s[start:l[0]]) + } + res = append(res, s[l[0]:l[1]]) + start = l[1] + } + if start < len(s) { + res = append(res, s[start:]) + } + var ne []string + for _, p := range res { + if p != "" { + ne = append(ne, p) + } + } + return ne +} + +// closureToRegex generates a regex pattern from a cluster of similar hosts +func (m *Miner) closureToRegex(escaped bool, members []string) (string, int64) { + if len(members) == 0 { + return "", 0 + } + tokens := m.tokenize(members) + var maxLevel int + for _, memTokens := range tokens { + for i := range memTokens { + if i > maxLevel { + maxLevel = i + } + } + } + optional := make(map[int]map[int][]string) + levels := make(map[int]map[int]map[string]bool) + for _, memTokens := range tokens { + for i := range memTokens { + if _, ok := optional[i]; !ok { + optional[i] = make(map[int][]string) + } + if _, ok := levels[i]; !ok { + levels[i] = make(map[int]map[string]bool) + } + for j := 0; j < len(memTokens[i]); j++ { + optional[i][j] = append(optional[i][j], memTokens[i][j]) + } + for j, token := range memTokens[i] { + if _, ok := levels[i][j]; !ok { + levels[i][j] = make(map[string]bool) + } + levels[i][j][token] = true + } + } + } + numMembers := len(members) + var ret strings.Builder + for i := 0; i <= maxLevel; i++ { + if _, ok := levels[i]; !ok { + continue + } + var poss []int + for j := range levels[i] { + poss = append(poss, j) + } + sort.Ints(poss) + isLevel0 := i == 0 + var n strings.Builder + if !isLevel0 { + n.WriteString("(.") + } + for _, j := range poss { + var toks []string + for tk := range levels[i][j] { + toks = append(toks, tk) + } + sort.Strings(toks) + var alt string + if len(toks) == 0 { + continue + } + alt = strings.Join(toks, "|") + isOptPos := len(optional[i][j]) != numMembers + if isLevel0 && j == 0 { + n.WriteString("(" + alt + ")") + } else if len(toks) == 1 && j == 0 { + n.WriteString(alt) + } else { + q := "" + if isOptPos { + q = "?" 
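+					// not every cluster member contributed a token at this
+					// position, so the alternation group is emitted as optional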
+ } + n.WriteString("(" + alt + ")" + q) + } + } + // Calculate optional level + var posLists [][]string + for _, j := range poss { + posLists = append(posLists, optional[i][j]) + } + minLen := numMembers + for _, pl := range posLists { + if len(pl) < minLen { + minLen = len(pl) + } + } + valueSet := make(map[string]bool) + for kk := 0; kk < minLen; kk++ { + var sb strings.Builder + for _, pl := range posLists { + if kk < len(pl) { + sb.WriteString(pl[kk]) + } + } + valueSet[sb.String()] = true + } + isOptionalLevel := len(valueSet) != 1 || minLen != numMembers + if isLevel0 { + ret.WriteString(n.String()) + } else { + ret.WriteString(n.String()) + if isOptionalLevel { + ret.WriteString(")?") + } else { + ret.WriteString(")") + } + } + } + var full strings.Builder + full.WriteString(ret.String()) + if escaped { + full.WriteString(`\.`) + full.WriteString(escapeLiteral(m.opts.Target)) + } else { + full.WriteString(".") + full.WriteString(m.opts.Target) + } + r := full.String() + compressed := m.compressNumberRanges(r) + return compressed, 0 +} + +// escapeLiteral escapes regex special characters +func escapeLiteral(s string) string { + var sb strings.Builder + for _, c := range s { + switch c { + case '.', '^', '$', '*', '+', '?', '(', ')', '[', '{', '}', '|', '\\': + sb.WriteRune('\\') + fallthrough + default: + sb.WriteRune(c) + } + } + return sb.String() +} + +// compressNumberRanges optimizes number sequences in regex patterns +func (m *Miner) compressNumberRanges(regex string) string { + repl := make(map[string]string) + extraM := make(map[string]string) + hyphenM := make(map[string]bool) + var stack []int + i := 0 + for i < len(regex) { + if regex[i] == '(' { + stack = append(stack, i) + i++ + continue + } + if regex[i] == ')' && len(stack) > 0 { + start := stack[len(stack)-1] + stack = stack[:len(stack)-1] + group := regex[start+1 : i] + if strings.ContainsAny(group, "(?)") { + i++ + continue + } + tokens := strings.Split(group, "|") + var numbers []string + var nonnumbers []string + var hyphenated []string + for _, token := range tokens { + token = strings.TrimSpace(token) + if token == "" { + continue + } + if reNumeric.MatchString(token) { + numbers = append(numbers, token) + } else if strings.HasPrefix(token, "-") && reNumeric.MatchString(token[1:]) { + hyphenated = append(hyphenated, token[1:]) + } else { + nonnumbers = append(nonnumbers, token) + } + } + if len(numbers) > 0 && len(hyphenated) > 0 { + i++ + continue + } + if len(numbers) <= 1 && len(hyphenated) <= 1 { + i++ + continue + } + g1 := "" + g2 := strings.Join(nonnumbers, "|") + if len(numbers) > 1 { + g1 = strings.Join(numbers, "|") + } else { + g1 = strings.Join(hyphenated, "|") + } + fullGroup := "(" + group + ")" + repl[g1] = fullGroup + extraM[g1] = g2 + hyphenM[g1] = len(hyphenated) > 1 + i++ + continue + } + i++ + } + ret := regex + var keys []string + for k := range repl { + keys = append(keys, k) + } + sort.Strings(keys) + for _, group := range keys { + generalized := "(" + if hyphenM[group] { + generalized = "(-" + } + positions := make(map[int]map[int]bool) + toks := strings.Split(group, "|") + var revToks []string + for _, g := range toks { + rs := reverseString(g) + revToks = append(revToks, rs) + } + for _, token := range revToks { + for position := 0; position < len(token); position++ { + symbolStr := string(token[position]) + symbol, _ := strconv.Atoi(symbolStr) + if _, ok := positions[position]; !ok { + positions[position] = make(map[int]bool) + } + positions[position][symbol] = true + } + } + s := 
revToks + sort.Slice(s, func(p, q int) bool { return len(s[p]) < len(s[q]) }) + start := len(s[len(s)-1]) - 1 + end := len(s[0]) - 1 + for ii := start; ii > end; ii-- { + if _, ok := positions[ii]; !ok { + positions[ii] = make(map[int]bool) + } + positions[ii][-1] = true + } + var possPos []int + for p := range positions { + possPos = append(possPos, p) + } + sort.Ints(possPos) + for kk := len(possPos) - 1; kk >= 0; kk-- { + pos := possPos[kk] + symbolsMap := positions[pos] + hasNone := symbolsMap[-1] + delete(symbolsMap, -1) + var symbols []int + for k := range symbolsMap { + symbols = append(symbols, k) + } + sort.Ints(symbols) + if len(symbols) == 0 { + continue + } + startS := symbols[0] + endS := symbols[len(symbols)-1] + if startS == endS { + generalized += strconv.Itoa(startS) + } else { + generalized += fmt.Sprintf("[%d-%d]", startS, endS) + } + if hasNone { + generalized += "?" + } + } + generalized += ")" + ext := extraM[group] + if ext != "" { + generalized = "(" + generalized + "|(" + ext + "))" + } + rep := repl[group] + ret = strings.ReplaceAll(ret, rep, generalized) + } + return ret +} + +func reverseString(s string) string { + runes := []rune(s) + for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 { + runes[i], runes[j] = runes[j], runes[i] + } + return string(runes) +} + diff --git a/internal/runner/runner.go b/internal/runner/runner.go index 51668c92..60f030ce 100644 --- a/internal/runner/runner.go +++ b/internal/runner/runner.go @@ -28,6 +28,13 @@ type Options struct { Enrich bool Limit int MaxSize int + Mode string + MinDistance int + MaxDistance int + PatternThreshold int + QualityRatio int + NgramsLimit int + SaveRules string // internal/unexported fields wordlists goflags.RuntimeMap } @@ -60,6 +67,16 @@ func ParseFlags() *Options { flagSet.IntVar(&opts.Limit, "limit", 0, "limit the number of results to return (default 0)"), ) + flagSet.CreateGroup("pattern-mining", "Pattern Mining", + flagSet.StringVarP(&opts.Mode, "mode", "m", "default", "pattern mode: 'default' (user/default patterns), 'discover' (mined only), 'both' (combined)"), + flagSet.IntVar(&opts.MinDistance, "min-distance", 2, "minimum levenshtein distance for clustering"), + flagSet.IntVar(&opts.MaxDistance, "max-distance", 10, "maximum levenshtein distance for clustering"), + flagSet.IntVar(&opts.PatternThreshold, "pattern-threshold", 500, "pattern threshold for filtering low-quality patterns"), + flagSet.IntVar(&opts.QualityRatio, "quality-ratio", 25, "pattern quality ratio threshold (synthetic/observed ratio)"), + flagSet.IntVar(&opts.NgramsLimit, "ngrams-limit", 0, "limit number of n-grams to process (0 = no limit)"), + flagSet.StringVar(&opts.SaveRules, "save-rules", "", "save discovered patterns and metadata to JSON file (only works with discover/both modes)"), + ) + flagSet.CreateGroup("update", "Update", flagSet.CallbackVarP(GetUpdateCallback(), "update", "up", "update alterx to latest version"), flagSet.BoolVarP(&opts.DisableUpdateCheck, "disable-update-check", "duc", false, "disable automatic alterx update check"), From 0217a3a06c86b29c9e4f36d0ae1d037949904359 Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Mon, 10 Nov 2025 11:35:38 +0530 Subject: [PATCH 2/4] complete implementation --- .gitignore | 3 +- CLAUDE.md | 204 ++++++++++++++++++++++++ Makefile | 74 +++++++++ README.md | 27 ++++ cmd/alterx/main.go | 133 +++++---------- dedupe_writer.go | 143 +++++++++++++++++ dedupe_writer_test.go | 126 +++++++++++++++ internal/patternmining/patternmining.go | 27 ++++ 8 files changed, 642 
insertions(+), 95 deletions(-)
 create mode 100644 CLAUDE.md
 create mode 100644 Makefile
 create mode 100644 dedupe_writer.go
 create mode 100644 dedupe_writer_test.go

diff --git a/.gitignore b/.gitignore
index f7b81002..cf399397 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,6 @@
 *.dll
 *.so
 *.dylib
-cmd/alterx/alterx
 dist
 .idea
 .vscode
@@ -17,3 +16,5 @@ dist
 
 # Dependency directories (remove the comment below to include it)
 # vendor/
+/cmd/alterx/alterx
+/alterx
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..9866ef7a
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,204 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+AlterX is a fast and customizable subdomain wordlist generator using DSL (Domain-Specific Language). This repository extends the original [ProjectDiscovery alterx](https://github.com/projectdiscovery/alterx) with a **Go port** of the pattern-mining engine from [Regulator](https://github.com/cramppet/regulator) by @cramppet.
+
+**Key Features:**
+- Template-based subdomain generation using variables like `{{sub}}`, `{{suffix}}`, `{{word}}`
+- Pattern mining mode that automatically discovers subdomain patterns from observed data
+- Three operation modes: default (user patterns), discover (mined patterns), both (combined)
+- ClusterBomb attack pattern for generating permutations
+
+## Build & Development Commands
+
+```bash
+# Build the binary
+make build
+
+# Run tests
+make test
+
+# Run tests with coverage
+make test-coverage
+
+# Run linter (requires golangci-lint)
+make lint
+
+# Format code
+make fmt
+
+# Clean build artifacts
+make clean
+
+# Install to $GOPATH/bin
+make install
+
+# Build and run help
+make run
+```
+
+**Single test execution:**
+```bash
+go test -v -run TestFunctionName ./path/to/package
+```
+
+## Architecture
+
+### Core Components
+
+**1. Entry Point** (`cmd/alterx/main.go`)
+- CLI argument parsing via `runner.ParseFlags()`
+- Mode selection logic (default/discover/both)
+- Pattern mining flow orchestration
+- Deduplication between mined and user-defined patterns
+
+**2. Mutator Engine** (`mutator.go`, `algo.go`)
+- `Mutator` struct: Core permutation generator
+- `ClusterBomb` algorithm: Nth-order payload combination using recursion
+- `IndexMap`: Maintains deterministic ordering for payload iteration
+- Template replacement using variables extracted from input domains
+
+**3. Input Processing** (`inputs.go`)
+- `Input` struct: Parses domains into components (sub, suffix, tld, etld, etc.)
+- Variable extraction: `{{sub}}`, `{{sub1}}`, `{{suffix}}`, `{{root}}`, `{{sld}}`, etc.
+- Multi-level subdomain support (e.g., `cloud.api.example.com` → `sub=cloud`, `sub1=api`)
+
+**4. Pattern Mining** (`internal/patternmining/`)
+- **Three-phase discovery algorithm:**
+  1. Edit distance clustering (no prefix enforcement)
+  2. N-gram clustering (unigrams/bigrams)
+  3. N-gram prefix clustering with edit distance refinement
+- **Quality control:** Pattern threshold and quality ratio prevent over-generation
+- **Regex generation:** Converts clusters to patterns with alternations `(a|b)` and optional groups `(...)?`
+- **Number compression:** Optimizes `[0-9]` ranges automatically
+
+**5. DFA Engine** (`internal/dank/dank.go`)
+- Brzozowski's algorithm for DFA minimization
+- Thompson NFA construction from regex
+- Subset construction for NFA→DFA conversion
+- Reverse DFA for minimization (determinize → reverse → determinize → reverse → determinize)
+- Fixed-length string generation from automaton
+
+### File Structure
+
+```
+cmd/alterx/main.go         # Entry point, mode selection, orchestration
+internal/runner/
+  ├── runner.go            # CLI flag definitions and parsing
+  ├── config.go            # Version and config management
+  └── banner.go            # Banner display
+internal/patternmining/
+  ├── patternmining.go     # Main mining algorithm (3 phases)
+  ├── clustering.go        # Edit distance clustering logic
+  └── regex.go             # Tokenization and regex generation
+internal/dank/
+  └── dank.go              # DFA-based pattern generation (Brzozowski)
+mutator.go                 # Core Mutator with ClusterBomb algorithm
+algo.go                    # ClusterBomb implementation and IndexMap
+inputs.go                  # Domain parsing and variable extraction
+replacer.go                # Template variable replacement
+config.go                  # Default patterns and payloads
+util.go                    # Helper functions
+```
+
+## Key Concepts
+
+### Variables System
+Templates use variables extracted from input domains:
+- `{{sub}}`: Leftmost subdomain part (e.g., `api` in `api.example.com`)
+- `{{suffix}}`: Everything except leftmost part (e.g., `example.com`)
+- `{{root}}`: eTLD+1 (e.g., `example.com`)
+- `{{sld}}`: Second-level domain (e.g., `example`)
+- `{{tld}}`: Top-level domain (e.g., `com`)
+- `{{etld}}`: Extended TLD (e.g., `co.uk`)
+- `{{subN}}`: Multi-level support where N is depth (e.g., `{{sub1}}`, `{{sub2}}`)
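+
+A quick sketch of the variables in action (illustrative; the output values are hypothetical and the example assumes the stock `-p`/`-pattern` flag):
+
+```bash
+# expand a custom template against a single input domain
+echo api.example.com | alterx -p '{{word}}-{{sub}}.{{suffix}}'
+# emits one candidate per word payload, e.g. dev-api.example.com, staging-api.example.com
+```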
+
+### ClusterBomb Algorithm
+Generates all combinations of payloads across variables (see the sketch below):
+- Uses recursion with vector construction
+- Maintains deterministic ordering via IndexMap
+- Avoids redundant combinations (e.g., `api-api.example.com`)
+- Early exit when no variables present in template
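+
+A minimal standalone sketch of the recursion (illustrative only; the real code in `algo.go` additionally handles IndexMap ordering and prefix skipping):
+
+```go
+// clusterBomb fixes one variable per recursion level and emits every
+// combination of payload values for the remaining variables.
+func clusterBomb(payloads map[string][]string, vars []string, picked map[string]string, emit func(map[string]string)) {
+	if len(vars) == 0 {
+		emit(picked) // one complete assignment of all variables
+		return
+	}
+	v := vars[0]
+	for _, value := range payloads[v] {
+		picked[v] = value
+		clusterBomb(payloads, vars[1:], picked, emit)
+	}
+	delete(picked, v) // backtrack before returning to the caller
+}
+```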
+
+### Pattern Mining Workflow
+1. **Validate input:** Ensure domains share common target (e.g., `.example.com`)
+2. **Build distance table:** Compute pairwise Levenshtein distances
+3. **Phase 1 - Edit clustering:** Group by edit distance (min to max)
+4. **Phase 2 - N-grams:** Generate unigrams/bigrams, cluster by prefix
+5. **Phase 3 - Prefix clustering:** Apply edit distance within prefix groups
+6. **Quality validation:** Filter patterns using threshold and ratio metrics
+7. **Generate subdomains:** Use DFA to produce strings from patterns
+
+## Pattern Mining Modes
+
+**Default Mode** (`-m default` or omit):
+- Original alterx behavior
+- Uses user-defined or default patterns from config
+
+**Discover Mode** (`-m discover`):
+- Pattern mining only
+- Discovers patterns from input domains
+- Generates subdomains based only on mined patterns
+
+**Both Mode** (`-m both`):
+- Combines user-defined and mined patterns
+- Deduplicates results across both sources
+- Best for maximum coverage
+
+**Key Flags:**
+- `-min-distance 2`: Minimum Levenshtein distance for clustering
+- `-max-distance 10`: Maximum Levenshtein distance for clustering
+- `-pattern-threshold 500`: Minimum synthetic subdomains before ratio check
+- `-quality-ratio 25`: Max ratio of synthetic/observed subdomains
+- `-save-rules output.json`: Save discovered patterns and metadata to JSON file
+
+## Common Patterns
+
+### Adding New CLI Flags
+1. Add field to `Options` struct in `internal/runner/runner.go`
+2. Register flag in `ParseFlags()` using appropriate flag group
+3. Handle flag value in main logic (`cmd/alterx/main.go`)
+
+### Adding New Variables
+1. Parse in `NewInput()` in `inputs.go`
+2. Add to `Input.GetMap()` return value
+3. Update template validation in `mutator.go`
+
+### Modifying Pattern Mining
+- **Clustering logic:** `internal/patternmining/clustering.go`
+- **Tokenization rules:** `tokenize()` in `internal/patternmining/regex.go`
+- **Quality metrics:** `isGoodRule()` in `internal/patternmining/patternmining.go`
+
+## Testing Strategy
+
+- Unit tests in `*_test.go` files (e.g., `mutator_test.go`, `inputs_test.go`)
+- Test individual components before integration
+- Use table-driven tests for variable extraction and pattern generation
+- Validate pattern mining with known domain sets
+
+## Important Notes
+
+- **Dedupe enabled by default:** `DedupeResults = true` in `mutator.go`
+- **Prefix optimization:** ClusterBomb skips words already in leftmost subdomain
+- **Pattern quality critical:** Low thresholds generate millions of subdomains
+- **Distance memoization:** Pattern mining caches Levenshtein distances for performance
+- **DFA minimization:** Three-pass Brzozowski ensures minimal automaton
+- **No breaking changes:** All pattern mining is additive; default behavior unchanged
+
+## Credits
+
+- **Original alterx:** [ProjectDiscovery](https://github.com/projectdiscovery/alterx)
+- **Pattern mining algorithm:** [Regulator](https://github.com/cramppet/regulator) by @cramppet
+- **DFA implementation:** Ported from original regulator/dank library
+
+## Development Guidelines
+
+- Maintain compatibility with original alterx API
+- Keep pattern mining as optional feature (don't force on users)
+- Preserve deterministic output ordering for testing
+- Use `gologger` for all logging (not fmt.Println)
+- Follow Go naming conventions and project structure
+- Add tests for new features
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..104f3aa3
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,74 @@
+.PHONY: all build install test clean lint fmt vet help
+
+# Binary name
+BINARY_NAME=alterx
+BINARY_PATH=./cmd/alterx
+
+# Go parameters
+GOCMD=go
+GOBUILD=$(GOCMD) build
+GOCLEAN=$(GOCMD) clean
+GOTEST=$(GOCMD) test
+GOGET=$(GOCMD) get
+GOMOD=$(GOCMD) mod
+GOFMT=$(GOCMD) fmt
+GOVET=$(GOCMD) vet
+
+# Build flags
+LDFLAGS=-s -w
+
+all: build ## Build the project
+
+build: ## Build the binary
+	@echo "Building $(BINARY_NAME)..."
+	$(GOBUILD) -ldflags="$(LDFLAGS)" -o $(BINARY_NAME) $(BINARY_PATH)
+	@echo "Build complete: ./$(BINARY_NAME)"
+
+install: ## Install the binary to $GOPATH/bin
+	@echo "Installing $(BINARY_NAME)..."
+	$(GOCMD) install $(BINARY_PATH)
+	@echo "Install complete"
+
+test: ## Run tests
+	@echo "Running tests..."
+	$(GOTEST) -v ./...
+
+test-coverage: ## Run tests with coverage
+	@echo "Running tests with coverage..."
+	$(GOTEST) -v -coverprofile=coverage.out ./...
+	$(GOCMD) tool cover -html=coverage.out -o coverage.html
+	@echo "Coverage report generated: coverage.html"
+
+lint: ## Run linter (requires golangci-lint)
+	@echo "Running linter..."
+	@which golangci-lint > /dev/null || (echo "golangci-lint not installed. Install: https://golangci-lint.run/usage/install/" && exit 1)
+	golangci-lint run ./...
+
+fmt: ## Format code
+	@echo "Formatting code..."
+	$(GOFMT) ./...
+
+vet: ## Run go vet
+	@echo "Running go vet..."
+	$(GOVET) ./...
+
+clean: ## Clean build artifacts
+	@echo "Cleaning..."
+	$(GOCLEAN)
+	rm -f $(BINARY_NAME)
+	rm -f coverage.out coverage.html
+	@echo "Clean complete"
+
+deps: ## Download dependencies
+	@echo "Downloading dependencies..."
+ $(GOMOD) download + $(GOMOD) tidy + +run: build ## Build and run the binary + ./$(BINARY_NAME) -h + +help: ## Show this help message + @echo "Available targets:" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' + + diff --git a/README.md b/README.md index 69704f56..9f8358a9 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,9 @@ - **Automatic word enrichment** - Pre-defined variables - **Configurable Patterns** +- **Pattern Mining** - Automatically discover subdomain patterns (Go port of [Regulator](https://github.com/cramppet/regulator)) - STDIN / List input +- Multiple operation modes (default, discover, both) ## Installation To install alterx, you need to have Golang 1.19 installed on your system. You can download Golang from [here](https://go.dev/doc/install). After installing Golang, you can use the following command to install alterx: @@ -45,6 +47,31 @@ To install alterx, you need to have Golang 1.19 installed on your system. You ca go install github.com/projectdiscovery/alterx/cmd/alterx@latest ``` +### Building from Source +```bash +# Clone the repository +git clone https://github.com/projectdiscovery/alterx.git +cd alterx + +# Build using Makefile +make build + +# Or build manually +go build ./cmd/alterx +``` + +Available Makefile targets: +```bash +make help # Show all available targets +make build # Build the binary +make test # Run tests +make test-coverage # Run tests with coverage +make lint # Run linter +make fmt # Format code +make clean # Clean build artifacts +make install # Install to $GOPATH/bin +``` + ## Help Menu You can use the following command to see the available flags and options: diff --git a/cmd/alterx/main.go b/cmd/alterx/main.go index 3bed8a97..5c158a5d 100644 --- a/cmd/alterx/main.go +++ b/cmd/alterx/main.go @@ -1,7 +1,6 @@ package main import ( - "context" "io" "os" "strings" @@ -10,6 +9,7 @@ import ( "github.com/projectdiscovery/alterx/internal/patternmining" "github.com/projectdiscovery/alterx/internal/runner" "github.com/projectdiscovery/gologger" + "golang.org/x/net/publicsuffix" ) func main() { @@ -21,15 +21,23 @@ func main() { gologger.Fatal().Msgf("invalid mode: %s (must be 'default', 'discover', or 'both')", cliOpts.Mode) } + // Write output with deduplication + output := getOutputWriter(cliOpts.Output) + defer closeOutput(output, cliOpts.Output) + // we intentionally remove all known subdomains from the output + // that way only the discovered subdomains are included in the output + dedupWriter := alterx.NewDedupingWriter(output, cliOpts.Domains...) 
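+	// NOTE: the deduping writer processes lines on a background goroutine;
+	// Close() blocks until every buffered line has been deduplicated and
+	// flushed to the underlying writer, so it must run after all writes.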
+ defer dedupWriter.Close() + + var estimatedDiscoverOutputs = 0 + // Handle pattern mining modes (discover or both) var minedPatterns []string if cliOpts.Mode == "discover" || cliOpts.Mode == "both" { - target := extractTargetDomain(cliOpts.Domains) + target := getNValidateRootDomain(cliOpts.Domains) if target == "" { gologger.Fatal().Msgf("pattern mining requires domains with a common target (e.g., sub.example.com)") } - - gologger.Info().Msgf("Pattern mining mode enabled (Go port of Regulator by @cramppet)") gologger.Info().Msgf("Target domain: %s", target) miner := patternmining.NewMiner(&patternmining.Options{ @@ -57,23 +65,17 @@ func main() { } } + estimatedDiscoverOutputs = int(miner.EstimateCount(result.Patterns)) + // Generate subdomains from discovered patterns + // and exit early if cliOpts.Mode == "discover" { // In discover mode, only use mined patterns generated := miner.GenerateFromPatterns(result.Patterns) - - // Write output - output := getOutputWriter(cliOpts.Output) - defer closeOutput(output, cliOpts.Output) - for _, subdomain := range generated { - if cliOpts.Limit > 0 && len(generated) >= cliOpts.Limit { - break - } - output.Write([]byte(subdomain + "\n")) + dedupWriter.Write([]byte(subdomain + "\n")) } - - gologger.Info().Msgf("Generated %d subdomains from discovered patterns", len(generated)) + gologger.Info().Msgf("Generated %d unique subdomains from discovered patterns", dedupWriter.Count()) return } @@ -105,82 +107,22 @@ func main() { } } - // In 'both' mode, add mined patterns to user patterns - if cliOpts.Mode == "both" && len(minedPatterns) > 0 { - // Convert mined patterns to alterx format - // Mined patterns are already in regex format, but alterx expects template format - // For now, we'll generate from mined patterns separately and combine results - target := extractTargetDomain(cliOpts.Domains) - miner := patternmining.NewMiner(&patternmining.Options{ - Domains: cliOpts.Domains, - Target: target, - MinDistance: cliOpts.MinDistance, - MaxDistance: cliOpts.MaxDistance, - PatternThreshold: cliOpts.PatternThreshold, - QualityRatio: float64(cliOpts.QualityRatio), - MaxLength: 1000, - NgramsLimit: cliOpts.NgramsLimit, - }) - - generated := miner.GenerateFromPatterns(minedPatterns) - - // Use a dedupe set for both modes - allResults := make(map[string]bool) - for _, g := range generated { - allResults[g] = true - } - - // Now run the normal alterx generation - output := getOutputWriter(cliOpts.Output) - defer closeOutput(output, cliOpts.Output) - - m, err := alterx.New(&alterOpts) - if err != nil { - gologger.Fatal().Msgf("failed to parse alterx config got %v", err) - } - - if cliOpts.Estimate { - estimated := m.EstimateCount() + len(generated) - gologger.Info().Msgf("Estimated Payloads (including duplicates): %v", estimated) - return - } - - // First write mined results - count := 0 - for subdomain := range allResults { - if cliOpts.Limit > 0 && count >= cliOpts.Limit { - break - } - output.Write([]byte(subdomain + "\n")) - count++ - } - - // Then write alterx results (with deduplication) - if err = executeAlterxWithDedup(m, output, allResults, cliOpts.Limit-count); err != nil { - gologger.Error().Msgf("failed to write output to file got %v", err) - } - - gologger.Info().Msgf("Generated %d total unique subdomains (both modes)", len(allResults)) - return - } - - // Standard default mode - output := getOutputWriter(cliOpts.Output) - defer closeOutput(output, cliOpts.Output) - m, err := alterx.New(&alterOpts) if err != nil { gologger.Fatal().Msgf("failed to parse 
alterx config got %v", err) } if cliOpts.Estimate { - gologger.Info().Msgf("Estimated Payloads (including duplicates): %v", m.EstimateCount()) + estimated := m.EstimateCount() + estimatedDiscoverOutputs + gologger.Info().Msgf("Estimated Payloads (including duplicates): %v", estimated) return } - - if err = m.ExecuteWithWriter(output); err != nil { + // Write alterx results to same dedupWriter (automatic deduplication) + if err = m.ExecuteWithWriter(dedupWriter); err != nil { gologger.Error().Msgf("failed to write output to file got %v", err) } + + gologger.Info().Msgf("Generated %d total unique subdomains (both modes)", dedupWriter.Count()) } // extractTargetDomain extracts the common target domain from input domains @@ -220,22 +162,25 @@ func closeOutput(output io.Writer, outputPath string) { } } -// executeAlterxWithDedup executes alterx with deduplication against existing results -func executeAlterxWithDedup(m *alterx.Mutator, output io.Writer, existing map[string]bool, remainingLimit int) error { - // We need to capture alterx output and dedupe it - // Create a custom writer that dedupes - count := 0 - resChan := m.Execute(context.TODO()) +func getNValidateRootDomain(domains []string) string { + if len(domains) == 0 { + return "" + } - for value := range resChan { - if remainingLimit > 0 && count >= remainingLimit { + var rootDomain string + // parse root domain from publicsuffix for first entry + for _, domain := range domains { + if strings.TrimSpace(domain) == "" { continue } - if !existing[value] && !strings.HasPrefix(value, "-") { - existing[value] = true - output.Write([]byte(value + "\n")) - count++ + if rootDomain == "" { + root, _ := publicsuffix.EffectiveTLDPlusOne(domain) + rootDomain = root + } else { + if !strings.HasSuffix(domain, rootDomain) { + gologger.Fatal().Msgf("domain %v does not have the same root domain as %v, only homogeneous domains are supported in discover mode", domain, rootDomain) + } } } - return nil + return "" } diff --git a/dedupe_writer.go b/dedupe_writer.go new file mode 100644 index 00000000..3551100a --- /dev/null +++ b/dedupe_writer.go @@ -0,0 +1,143 @@ +package alterx + +import ( + "bufio" + "bytes" + "io" + "strings" + "sync" + + "github.com/projectdiscovery/utils/dedupe" +) + +// DedupingWriter wraps an io.Writer with transparent deduplication using dedupe utils +type DedupingWriter struct { + writer io.Writer + inputCh chan string + blacklist map[string]bool + wg sync.WaitGroup + count int + countMu sync.Mutex + closed bool + buffer []byte +} + +// NewDedupingWriter creates a new DedupingWriter with optional blacklist/seed +// The seed parameter allows pre-populating items to skip +func NewDedupingWriter(w io.Writer, seed ...string) *DedupingWriter { + blacklist := make(map[string]bool, len(seed)) + for _, item := range seed { + blacklist[item] = true + } + + inputCh := make(chan string, 100) + dw := &DedupingWriter{ + writer: w, + inputCh: inputCh, + blacklist: blacklist, + buffer: make([]byte, 0), + } + + // Start async dedupe processing + dw.wg.Add(1) + go dw.processDeduped(inputCh) + + return dw +} + +// processDeduped handles the dedupe output and writes to underlying writer +func (dw *DedupingWriter) processDeduped(inputCh chan string) { + defer dw.wg.Done() + + // Create dedupe instance (it handles backend selection internally) + d := dedupe.NewDedupe(inputCh, 1024*1024) // 1MB estimate for byte length + d.Drain() + outputCh := d.GetResults() + + // Read deduplicated results and write to underlying writer + for value := range outputCh { 
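+		// Values arriving on outputCh have already been deduplicated by the
+		// dedupe backend (inputCh -> NewDedupe -> Drain -> GetResults); the
+		// checks below only apply alterx-specific filtering on top of that.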
+ // Skip if in blacklist + if dw.blacklist[value] { + continue + } + + // Skip empty lines and lines starting with '-' + if value == "" || strings.HasPrefix(value, "-") { + continue + } + + // Write to underlying writer + if _, err := dw.writer.Write([]byte(value + "\n")); err != nil { + // In a real-world scenario, we might want to handle this error + // For now, we continue processing + continue + } + + // Increment count + dw.countMu.Lock() + dw.count++ + dw.countMu.Unlock() + } +} + +// Write implements io.Writer interface +func (dw *DedupingWriter) Write(p []byte) (int, error) { + if dw.closed { + return 0, io.ErrClosedPipe + } + + originalLen := len(p) + + // Append to buffer to handle incomplete lines + dw.buffer = append(dw.buffer, p...) + + // Process complete lines + scanner := bufio.NewScanner(bytes.NewReader(dw.buffer)) + lastIdx := 0 + for scanner.Scan() { + line := scanner.Text() + lastIdx += len(line) + 1 // +1 for newline + + // Send to dedupe input channel + dw.inputCh <- line + } + + // Keep incomplete line in buffer + if lastIdx < len(dw.buffer) { + dw.buffer = dw.buffer[lastIdx:] + } else { + dw.buffer = dw.buffer[:0] + } + + // Always return original length to satisfy io.Writer contract + return originalLen, nil +} + +// Close flushes any remaining data and closes the writer +func (dw *DedupingWriter) Close() error { + if dw.closed { + return nil + } + dw.closed = true + + // Process any remaining buffered data + if len(dw.buffer) > 0 { + line := string(dw.buffer) + dw.inputCh <- line + } + + // Close input channel to signal dedupe to finish + close(dw.inputCh) + + // Wait for dedupe processing to complete + dw.wg.Wait() + + return nil +} + +// Count returns the number of unique items written +func (dw *DedupingWriter) Count() int { + dw.countMu.Lock() + defer dw.countMu.Unlock() + return dw.count +} diff --git a/dedupe_writer_test.go b/dedupe_writer_test.go new file mode 100644 index 00000000..aae59b73 --- /dev/null +++ b/dedupe_writer_test.go @@ -0,0 +1,126 @@ +package alterx + +import ( + "bytes" + "testing" + "time" +) + +func TestDedupingWriter(t *testing.T) { + t.Run("basic deduplication using dedupe utils", func(t *testing.T) { + buf := &bytes.Buffer{} + dw := NewDedupingWriter(buf) + + // Write some duplicate data + dw.Write([]byte("test1\n")) + dw.Write([]byte("test2\n")) + dw.Write([]byte("test1\n")) // duplicate + dw.Write([]byte("test3\n")) + dw.Write([]byte("test2\n")) // duplicate + + // Close to flush and wait for async processing + dw.Close() + + // Give a moment for async processing to complete + time.Sleep(100 * time.Millisecond) + + if dw.Count() != 3 { + t.Errorf("Expected 3 unique items, got %d", dw.Count()) + } + + output := buf.String() + // Check all unique items are present (order may vary due to async) + if !contains(output, "test1\n") || !contains(output, "test2\n") || !contains(output, "test3\n") { + t.Errorf("Expected all unique items in output, got %q", output) + } + }) + + t.Run("with blacklist/seed", func(t *testing.T) { + buf := &bytes.Buffer{} + dw := NewDedupingWriter(buf, "test1", "test3") + + // Write data including items in blacklist + dw.Write([]byte("test1\n")) // in blacklist + dw.Write([]byte("test2\n")) + dw.Write([]byte("test3\n")) // in blacklist + dw.Write([]byte("test4\n")) + + dw.Close() + time.Sleep(100 * time.Millisecond) + + if dw.Count() != 2 { + t.Errorf("Expected 2 unique items (excluding blacklist), got %d", dw.Count()) + } + + output := buf.String() + // Should not contain blacklisted items + if contains(output, 
"test1\n") || contains(output, "test3\n") { + t.Errorf("Output should not contain blacklisted items, got %q", output) + } + // Should contain non-blacklisted items + if !contains(output, "test2\n") || !contains(output, "test4\n") { + t.Errorf("Output should contain test2 and test4, got %q", output) + } + }) + + t.Run("skip lines starting with dash", func(t *testing.T) { + buf := &bytes.Buffer{} + dw := NewDedupingWriter(buf) + + dw.Write([]byte("test1\n")) + dw.Write([]byte("-skip-this\n")) + dw.Write([]byte("test2\n")) + dw.Write([]byte("-skip-that\n")) + + dw.Close() + time.Sleep(100 * time.Millisecond) + + if dw.Count() != 2 { + t.Errorf("Expected 2 unique items (excluding dash lines), got %d", dw.Count()) + } + + output := buf.String() + if contains(output, "-skip") { + t.Errorf("Output should not contain lines starting with dash, got %q", output) + } + }) + + t.Run("handle multiple lines in single write", func(t *testing.T) { + buf := &bytes.Buffer{} + dw := NewDedupingWriter(buf) + + // Write multiple lines at once with duplicates + dw.Write([]byte("test1\ntest2\ntest1\ntest3\n")) + + dw.Close() + time.Sleep(100 * time.Millisecond) + + if dw.Count() != 3 { + t.Errorf("Expected 3 unique items, got %d", dw.Count()) + } + + output := buf.String() + if !contains(output, "test1\n") || !contains(output, "test2\n") || !contains(output, "test3\n") { + t.Errorf("Expected all unique items in output, got %q", output) + } + }) + + t.Run("skip empty lines", func(t *testing.T) { + buf := &bytes.Buffer{} + dw := NewDedupingWriter(buf) + + dw.Write([]byte("test1\n\ntest2\n\n")) + + dw.Close() + time.Sleep(100 * time.Millisecond) + + if dw.Count() != 2 { + t.Errorf("Expected 2 unique items (skipping empty), got %d", dw.Count()) + } + }) +} + +// Helper function to check if a string contains a substring +func contains(s, substr string) bool { + return bytes.Contains([]byte(s), []byte(substr)) +} diff --git a/internal/patternmining/patternmining.go b/internal/patternmining/patternmining.go index b35a47cc..4122bbec 100644 --- a/internal/patternmining/patternmining.go +++ b/internal/patternmining/patternmining.go @@ -334,6 +334,33 @@ func (m *Miner) SaveRules(result *Result, filename string) error { return enc.Encode(grouped) } +// EstimateCount estimates the number of subdomains that would be generated from patterns +// This uses the DFA's NumWords method to count without actually generating strings +func (m *Miner) EstimateCount(patterns []string) int64 { + var totalEstimate int64 + + for _, pattern := range patterns { + subdomainPattern := strings.TrimSuffix(pattern, "."+m.opts.Target) + if len(subdomainPattern) == 0 { + continue + } + + // Calculate fixed length for generation + tempEncoder := dank.NewDankEncoder(m.preparePattern(subdomainPattern), 1) + fixedSlice := tempEncoder.NumStates() - 2 + if fixedSlice < 0 { + fixedSlice = 0 + } + + encoder := dank.NewDankEncoder(m.preparePattern(subdomainPattern), fixedSlice) + // Use NumWords to count strings at fixed length without generating them + count := encoder.NumWords(fixedSlice, fixedSlice) + totalEstimate += count + } + + return totalEstimate +} + // GenerateFromPatterns generates subdomains from discovered patterns func (m *Miner) GenerateFromPatterns(patterns []string) []string { var results []string From 76f6ba1b03c95e051161dda3397a07bf170af224 Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Mon, 10 Nov 2025 12:37:55 +0530 Subject: [PATCH 3/4] fix: resolve all linting errors and clean up build artifacts MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add proper error handling for all Write() and Close() operations - Use defer with error handlers for cleanup operations - Remove unused extractTargetDomain() function - Add coverage.html to .gitignore to exclude build artifacts All changes follow Go best practices: - Deferred error handlers with logging - t.Fatalf() for test error handling - Named return pattern for defer close error propagation Linting: 0 issues (previously 12 issues) Tests: All passing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .gitignore | 1 + cmd/alterx/main.go | 30 ++++------ dedupe_writer_test.go | 80 ++++++++++++++++++------- examples/main.go | 4 +- internal/patternmining/patternmining.go | 6 +- 5 files changed, 80 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index cf399397..2ca7b333 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ dist # Output of the go coverage tool, specifically when used with LiteIDE *.out +coverage.html # Dependency directories (remove the comment below to include it) # vendor/ diff --git a/cmd/alterx/main.go b/cmd/alterx/main.go index 5c158a5d..34f9f66d 100644 --- a/cmd/alterx/main.go +++ b/cmd/alterx/main.go @@ -27,7 +27,11 @@ func main() { // we intentionally remove all known subdomains from the output // that way only the discovered subdomains are included in the output dedupWriter := alterx.NewDedupingWriter(output, cliOpts.Domains...) - defer dedupWriter.Close() + defer func() { + if err := dedupWriter.Close(); err != nil { + gologger.Error().Msgf("failed to close dedup writer: %v", err) + } + }() var estimatedDiscoverOutputs = 0 @@ -73,7 +77,9 @@ func main() { // In discover mode, only use mined patterns generated := miner.GenerateFromPatterns(result.Patterns) for _, subdomain := range generated { - dedupWriter.Write([]byte(subdomain + "\n")) + if _, err := dedupWriter.Write([]byte(subdomain + "\n")); err != nil { + gologger.Error().Msgf("failed to write subdomain: %v", err) + } } gologger.Info().Msgf("Generated %d unique subdomains from discovered patterns", dedupWriter.Count()) return @@ -125,22 +131,6 @@ func main() { gologger.Info().Msgf("Generated %d total unique subdomains (both modes)", dedupWriter.Count()) } -// extractTargetDomain extracts the common target domain from input domains -func extractTargetDomain(domains []string) string { - if len(domains) == 0 { - return "" - } - - // Take the first domain and extract root domain - first := domains[0] - parts := strings.Split(first, ".") - if len(parts) >= 2 { - // Return last two parts as target domain (e.g., "example.com") - return strings.Join(parts[len(parts)-2:], ".") - } - return first -} - // getOutputWriter returns the appropriate output writer func getOutputWriter(outputPath string) io.Writer { if outputPath != "" { @@ -157,7 +147,9 @@ func getOutputWriter(outputPath string) io.Writer { func closeOutput(output io.Writer, outputPath string) { if outputPath != "" { if closer, ok := output.(io.Closer); ok { - closer.Close() + if err := closer.Close(); err != nil { + gologger.Error().Msgf("failed to close output file: %v", err) + } } } } diff --git a/dedupe_writer_test.go b/dedupe_writer_test.go index aae59b73..564f3880 100644 --- a/dedupe_writer_test.go +++ b/dedupe_writer_test.go @@ -12,14 +12,26 @@ func TestDedupingWriter(t *testing.T) { dw := NewDedupingWriter(buf) // Write some duplicate data - dw.Write([]byte("test1\n")) - dw.Write([]byte("test2\n")) - dw.Write([]byte("test1\n")) // 
duplicate - dw.Write([]byte("test3\n")) - dw.Write([]byte("test2\n")) // duplicate + if _, err := dw.Write([]byte("test1\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("test2\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("test1\n")); err != nil { // duplicate + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("test3\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("test2\n")); err != nil { // duplicate + t.Fatalf("failed to write: %v", err) + } // Close to flush and wait for async processing - dw.Close() + if err := dw.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } // Give a moment for async processing to complete time.Sleep(100 * time.Millisecond) @@ -40,12 +52,22 @@ func TestDedupingWriter(t *testing.T) { dw := NewDedupingWriter(buf, "test1", "test3") // Write data including items in blacklist - dw.Write([]byte("test1\n")) // in blacklist - dw.Write([]byte("test2\n")) - dw.Write([]byte("test3\n")) // in blacklist - dw.Write([]byte("test4\n")) + if _, err := dw.Write([]byte("test1\n")); err != nil { // in blacklist + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("test2\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("test3\n")); err != nil { // in blacklist + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("test4\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } - dw.Close() + if err := dw.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } time.Sleep(100 * time.Millisecond) if dw.Count() != 2 { @@ -67,12 +89,22 @@ func TestDedupingWriter(t *testing.T) { buf := &bytes.Buffer{} dw := NewDedupingWriter(buf) - dw.Write([]byte("test1\n")) - dw.Write([]byte("-skip-this\n")) - dw.Write([]byte("test2\n")) - dw.Write([]byte("-skip-that\n")) + if _, err := dw.Write([]byte("test1\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("-skip-this\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("test2\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } + if _, err := dw.Write([]byte("-skip-that\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } - dw.Close() + if err := dw.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } time.Sleep(100 * time.Millisecond) if dw.Count() != 2 { @@ -90,9 +122,13 @@ func TestDedupingWriter(t *testing.T) { dw := NewDedupingWriter(buf) // Write multiple lines at once with duplicates - dw.Write([]byte("test1\ntest2\ntest1\ntest3\n")) + if _, err := dw.Write([]byte("test1\ntest2\ntest1\ntest3\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } - dw.Close() + if err := dw.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } time.Sleep(100 * time.Millisecond) if dw.Count() != 3 { @@ -109,9 +145,13 @@ func TestDedupingWriter(t *testing.T) { buf := &bytes.Buffer{} dw := NewDedupingWriter(buf) - dw.Write([]byte("test1\n\ntest2\n\n")) + if _, err := dw.Write([]byte("test1\n\ntest2\n\n")); err != nil { + t.Fatalf("failed to write: %v", err) + } - dw.Close() + if err := dw.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } time.Sleep(100 * time.Millisecond) if dw.Count() != 2 { diff --git a/examples/main.go b/examples/main.go index 1dd52388..fd06a48c 100644 --- a/examples/main.go +++ b/examples/main.go @@ -20,5 +20,7 @@ func main() { if 
err != nil { gologger.Fatal().Msg(err.Error()) } - m.ExecuteWithWriter(os.Stdout) + if err := m.ExecuteWithWriter(os.Stdout); err != nil { + gologger.Fatal().Msgf("failed to execute: %v", err) + } } diff --git a/internal/patternmining/patternmining.go b/internal/patternmining/patternmining.go index 4122bbec..5856b698 100644 --- a/internal/patternmining/patternmining.go +++ b/internal/patternmining/patternmining.go @@ -324,7 +324,11 @@ func (m *Miner) SaveRules(result *Result, filename string) error { if err != nil { return err } - defer f.Close() + defer func() { + if closeErr := f.Close(); closeErr != nil && err == nil { + err = closeErr + } + }() // Group patterns by step with metadata grouped := m.groupRulesByStep(result.Metadata) From 4e32f35dd2b5d07fa414fbc4d512c3d4df040042 Mon Sep 17 00:00:00 2001 From: Tarun Koyalwar Date: Mon, 10 Nov 2025 13:27:21 +0530 Subject: [PATCH 4/4] fix: address PR review comments - critical and major issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed multiple critical and major issues identified in code review: Critical fixes: - Fix getNValidateRootDomain to return computed root domain - Add proper error handling for publicsuffix.EffectiveTLDPlusOne - Strengthen domain validation with dot boundary check - Fix dedupe_writer buffer handling to process only complete lines - Replace bufio.Scanner with bytes.IndexByte for correct partial write handling Major fixes: - Include MaxDistance in clustering loop bounds (Phase 1 and Phase 3) - Fix nwords type assertion from int to int64 with proper casting Improvements: - Remove unnecessary time.Sleep calls from tests (Close() already blocks) - Clean up unused imports 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- cmd/alterx/main.go | 9 ++++++--- dedupe_writer.go | 22 ++++++++-------------- dedupe_writer_test.go | 8 -------- internal/patternmining/patternmining.go | 8 ++++---- 4 files changed, 18 insertions(+), 29 deletions(-) diff --git a/cmd/alterx/main.go b/cmd/alterx/main.go index 34f9f66d..964f516c 100644 --- a/cmd/alterx/main.go +++ b/cmd/alterx/main.go @@ -166,13 +166,16 @@ func getNValidateRootDomain(domains []string) string { continue } if rootDomain == "" { - root, _ := publicsuffix.EffectiveTLDPlusOne(domain) + root, err := publicsuffix.EffectiveTLDPlusOne(domain) + if err != nil || root == "" { + gologger.Fatal().Msgf("failed to derive root domain from %v: %v", domain, err) + } rootDomain = root } else { - if !strings.HasSuffix(domain, rootDomain) { + if domain != rootDomain && !strings.HasSuffix(domain, "."+rootDomain) { gologger.Fatal().Msgf("domain %v does not have the same root domain as %v, only homogeneous domains are supported in discover mode", domain, rootDomain) } } } - return "" + return rootDomain } diff --git a/dedupe_writer.go b/dedupe_writer.go index 3551100a..0ce1617e 100644 --- a/dedupe_writer.go +++ b/dedupe_writer.go @@ -1,7 +1,6 @@ package alterx import ( - "bufio" "bytes" "io" "strings" @@ -92,21 +91,16 @@ func (dw *DedupingWriter) Write(p []byte) (int, error) { dw.buffer = append(dw.buffer, p...) 
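+	// Scanning for '\n' manually keeps any trailing partial line buffered in
+	// dw.buffer until the next Write or Close, which the previous
+	// bufio.Scanner approach mishandled for writes without a final newline.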
// Process complete lines - scanner := bufio.NewScanner(bytes.NewReader(dw.buffer)) - lastIdx := 0 - for scanner.Scan() { - line := scanner.Text() - lastIdx += len(line) + 1 // +1 for newline + for { + idx := bytes.IndexByte(dw.buffer, '\n') + if idx == -1 { + break + } - // Send to dedupe input channel + line := string(dw.buffer[:idx]) dw.inputCh <- line - } - - // Keep incomplete line in buffer - if lastIdx < len(dw.buffer) { - dw.buffer = dw.buffer[lastIdx:] - } else { - dw.buffer = dw.buffer[:0] + // Drop processed line plus newline + dw.buffer = dw.buffer[idx+1:] } // Always return original length to satisfy io.Writer contract diff --git a/dedupe_writer_test.go b/dedupe_writer_test.go index 564f3880..4f9c5c79 100644 --- a/dedupe_writer_test.go +++ b/dedupe_writer_test.go @@ -3,7 +3,6 @@ package alterx import ( "bytes" "testing" - "time" ) func TestDedupingWriter(t *testing.T) { @@ -33,9 +32,6 @@ func TestDedupingWriter(t *testing.T) { t.Fatalf("failed to close: %v", err) } - // Give a moment for async processing to complete - time.Sleep(100 * time.Millisecond) - if dw.Count() != 3 { t.Errorf("Expected 3 unique items, got %d", dw.Count()) } @@ -68,7 +64,6 @@ func TestDedupingWriter(t *testing.T) { if err := dw.Close(); err != nil { t.Fatalf("failed to close: %v", err) } - time.Sleep(100 * time.Millisecond) if dw.Count() != 2 { t.Errorf("Expected 2 unique items (excluding blacklist), got %d", dw.Count()) @@ -105,7 +100,6 @@ func TestDedupingWriter(t *testing.T) { if err := dw.Close(); err != nil { t.Fatalf("failed to close: %v", err) } - time.Sleep(100 * time.Millisecond) if dw.Count() != 2 { t.Errorf("Expected 2 unique items (excluding dash lines), got %d", dw.Count()) @@ -129,7 +123,6 @@ func TestDedupingWriter(t *testing.T) { if err := dw.Close(); err != nil { t.Fatalf("failed to close: %v", err) } - time.Sleep(100 * time.Millisecond) if dw.Count() != 3 { t.Errorf("Expected 3 unique items, got %d", dw.Count()) @@ -152,7 +145,6 @@ func TestDedupingWriter(t *testing.T) { if err := dw.Close(); err != nil { t.Fatalf("failed to close: %v", err) } - time.Sleep(100 * time.Millisecond) if dw.Count() != 2 { t.Errorf("Expected 2 unique items (skipping empty), got %d", dw.Count()) diff --git a/internal/patternmining/patternmining.go b/internal/patternmining/patternmining.go index 5856b698..cc76da09 100644 --- a/internal/patternmining/patternmining.go +++ b/internal/patternmining/patternmining.go @@ -116,7 +116,7 @@ func (m *Miner) Mine() (*Result, error) { // Phase 1: No enforced prefix - edit distance clustering gologger.Verbose().Msgf("Phase 1: Edit distance clustering...") - for k := m.opts.MinDistance; k < m.opts.MaxDistance; k++ { + for k := m.opts.MinDistance; k <= m.opts.MaxDistance; k++ { closures := m.editClosures(knownHosts, k) for _, closure := range closures { if len(closure) <= 1 { @@ -214,7 +214,7 @@ func (m *Miner) Mine() (*Result, error) { // Apply edit distance clustering within prefix group if len(prefix) > 1 { - for kk := m.opts.MinDistance; kk < m.opts.MaxDistance; kk++ { + for kk := m.opts.MinDistance; kk <= m.opts.MaxDistance; kk++ { closures := m.editClosures(keys2, kk) for _, closure := range closures { rUn, _ := m.closureToRegex(false, closure) @@ -498,8 +498,8 @@ func (m *Miner) groupRulesByStep(rules map[string]map[string]interface{}) *Rules if clusterSize, ok := meta["cluster_size"].(int); ok { patternMeta.ClusterSize = clusterSize } - if nwords, ok := meta["nwords"].(int); ok { - patternMeta.Nwords = nwords + if nwords, ok := meta["nwords"].(int64); ok { + 
patternMeta.Nwords = int(nwords) } if ratio, ok := meta["ratio"].(float64); ok { patternMeta.Ratio = ratio