From e6dbf5d8a29850ff665e403557d2856fba29340b Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sun, 31 May 2026 01:42:19 +0000 Subject: [PATCH] feat: Add granular NumberMode for parsing numbers Replaces the boolean `NumberSplitting` flag with a `NumberMode` enum, allowing for finer control over digit boundaries. Introduces `MergeRecursive` (which merges digits with words but splits before uppercase letters) and `TreatAsLowercase` (which treats digits identically to lowercase letters). The existing `WithNumberSplitting` option is retained for backward compatibility and mapped to the new modes. Fixes #23. Co-authored-by: arran4 <111667+arran4@users.noreply.github.com> --- parser.go | 47 +++++++++++++++++++++++++++++++++---------- parts.go | 31 +++++++++++++++++++++++----- parts_num_test.go | 51 +++++++++++++++++++++++++++++++++++++++++++++++ types.go | 4 ++-- 4 files changed, 116 insertions(+), 17 deletions(-) create mode 100644 parts_num_test.go diff --git a/parser.go b/parser.go index 5497387..4d705e7 100644 --- a/parser.go +++ b/parser.go @@ -18,8 +18,8 @@ func Parse(input string, opts ...any) ([]Word, error) { subs, stats := StringToSubParts(input) p := &ParserConfig{ - SmartAcronyms: true, - NumberSplitting: false, + SmartAcronyms: true, + NumberMode: NumberModeNone, } for _, opt := range opts { @@ -55,10 +55,25 @@ type ParserConfig struct { // should be treated as AcronymWord instead of UpperCaseWord. // Defaults to true. SmartAcronyms bool - // NumberSplitting controls whether to split on letter-digit boundaries. - NumberSplitting bool + // NumberMode controls how numbers are handled during word splitting. + NumberMode NumberMode } +// NumberMode defines the strategy for handling numbers during parsing. +type NumberMode int + +const ( + // NumberModeNone does not perform any special number splitting. 
+ NumberModeNone NumberMode = iota + // NumberModeSplitAlways splits on any transition between a letter and a digit. + NumberModeSplitAlways + // NumberModeMergeRecursive treats digits as compatible with both preceding and succeeding lowercase letters, + // preventing splits like 123test -> 123-test; a digit followed by an uppercase letter still splits (User123ID -> User123, ID). + NumberModeMergeRecursive + // NumberModeTreatAsLowercase treats digits exactly as if they were lowercase letters for boundary detection (so UPPER123 -> UPPE, R123). + NumberModeTreatAsLowercase +) + // ParserOption configures the parser. type ParserOption interface { Apply(*ParserConfig) @@ -91,9 +106,21 @@ func WithSmartAcronyms(enabled bool) ParserOption { } // WithNumberSplitting enables or disables splitting on letter-digit boundaries. +// It is equivalent to WithNumberMode(NumberModeSplitAlways) when true, and WithNumberMode(NumberModeNone) when false. func WithNumberSplitting(enabled bool) ParserOption { return funcParserOption(func(p *ParserConfig) { - p.NumberSplitting = enabled + if enabled { + p.NumberMode = NumberModeSplitAlways + } else { + p.NumberMode = NumberModeNone + } + }) +} + +// WithNumberMode sets the specific number splitting mode. 
+func WithNumberMode(mode NumberMode) ParserOption { + return funcParserOption(func(p *ParserConfig) { + p.NumberMode = mode }) } @@ -123,15 +150,15 @@ func DetectPartitioner(stats Stats, config ...*ParserConfig) Partitioner { } } - splitNumber := false + numberMode := NumberModeNone if len(config) > 0 && config[0] != nil { - splitNumber = config[0].NumberSplitting + numberMode = config[0].NumberMode } return NewPartitioner(PartitionerConfig{ - Delimiters: delimiters, - SplitCamel: true, - SplitNumber: splitNumber, + Delimiters: delimiters, + SplitCamel: true, + NumberMode: numberMode, }) } diff --git a/parts.go b/parts.go index 65ded37..541b172 100644 --- a/parts.go +++ b/parts.go @@ -68,7 +68,7 @@ func CamelCasePartitioner(subs []SubPart) []Part { type PartitionerConfig struct { Delimiters map[rune]bool SplitCamel bool - SplitNumber bool + NumberMode NumberMode PreserveSep bool // If true, delimiters are returned as SeparatorPart instead of discarded } @@ -93,27 +93,48 @@ func NewPartitioner(cfg PartitionerConfig) Partitioner { // Transition check isSplit := false - if (cfg.SplitCamel || cfg.SplitNumber) && i > 0 && len(current) > 0 { + if (cfg.SplitCamel || cfg.NumberMode != NumberModeNone) && i > 0 && len(current) > 0 { prev := subs[i-1] // Note: if prev was delimiter, current is empty or started anew. // We rely on current being non-empty to check transitions within a word chunk. 
if cfg.SplitCamel { + isPrevLower := prev.IsLower() + isPrevUpper := prev.IsUpper() + isCurrUpper := s.IsUpper() + + if cfg.NumberMode == NumberModeTreatAsLowercase { + if prev.IsDigit() { + isPrevLower = true + } + } + // lower -> Upper - if prev.IsLower() && s.IsUpper() { + if isPrevLower && isCurrUpper { isSplit = true } // Upper -> Upper -> lower (PDFLoader split at L) if i+1 < len(subs) { next := subs[i+1] - if prev.IsUpper() && s.IsUpper() && next.IsLower() { + isNextLower := next.IsLower() + if cfg.NumberMode == NumberModeTreatAsLowercase && next.IsDigit() { + isNextLower = true + } + if isPrevUpper && isCurrUpper && isNextLower { + isSplit = true + } + } + + // MergeRecursive specific rule: digit -> Upper triggers a split, similar to lower -> Upper + if cfg.NumberMode == NumberModeMergeRecursive { + if prev.IsDigit() && isCurrUpper { isSplit = true } } } - if cfg.SplitNumber { + if cfg.NumberMode == NumberModeSplitAlways { // Letter -> Digit -> Split. // Digit -> Letter -> Split. 
if prev.IsLetter() && s.IsDigit() { diff --git a/parts_num_test.go b/parts_num_test.go new file mode 100644 index 0000000..e17975c --- /dev/null +++ b/parts_num_test.go @@ -0,0 +1,51 @@ +package strings2 + +import ( + "reflect" + "testing" +) + +func TestNumberMode(t *testing.T) { + tests := []struct { + name string + input string + mode NumberMode + expected []string + }{ + // None + {"None_User123ID", "User123ID", NumberModeNone, []string{"User123ID"}}, + {"None_UPPER123", "UPPER123", NumberModeNone, []string{"UPPER123"}}, + {"None_123test", "123test", NumberModeNone, []string{"123test"}}, + + // SplitAlways + {"SplitAlways_User123ID", "User123ID", NumberModeSplitAlways, []string{"User", "123", "ID"}}, + {"SplitAlways_UPPER123", "UPPER123", NumberModeSplitAlways, []string{"UPPER", "123"}}, + {"SplitAlways_123test", "123test", NumberModeSplitAlways, []string{"123", "test"}}, + + // MergeRecursive + {"MergeRecursive_User123ID", "User123ID", NumberModeMergeRecursive, []string{"User123", "ID"}}, + {"MergeRecursive_UPPER123", "UPPER123", NumberModeMergeRecursive, []string{"UPPER123"}}, + {"MergeRecursive_123test", "123test", NumberModeMergeRecursive, []string{"123test"}}, + + // TreatAsLowercase + {"TreatAsLowercase_User123ID", "User123ID", NumberModeTreatAsLowercase, []string{"User123", "ID"}}, + {"TreatAsLowercase_UPPER123", "UPPER123", NumberModeTreatAsLowercase, []string{"UPPE", "R123"}}, + {"TreatAsLowercase_123test", "123test", NumberModeTreatAsLowercase, []string{"123test"}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + words, err := Parse(tt.input, WithNumberMode(tt.mode)) + if err != nil { + t.Fatalf("Parse failed: %v", err) + } + var got []string + for _, w := range words { + got = append(got, w.String()) + } + if !reflect.DeepEqual(got, tt.expected) { + t.Errorf("Parse(%q) with mode %v = %v; want %v", tt.input, tt.mode, got, tt.expected) + } + }) + } +} diff --git a/types.go b/types.go index 1bf1568..1766082 100644 --- 
a/types.go +++ b/types.go @@ -413,8 +413,8 @@ func WordsToFormattedCase(words []Word, opts ...any) (string, error) { func PartsToFormattedCase(parts []Part, opts ...any) (string, error) { // Extract ParserConfig from opts to use for classification p := &ParserConfig{ - SmartAcronyms: true, - NumberSplitting: false, + SmartAcronyms: true, + NumberMode: NumberModeNone, } for _, opt := range opts { if o, ok := opt.(ParserOption); ok {