From eae43d36312e79462b30659895c6eec6c2e65e42 Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Mon, 24 Nov 2025 17:59:53 -0500 Subject: [PATCH 01/10] chore: remove deprecations --- UnicodeBasic.lean | 141 -------------------- UnicodeBasic/TableLookup.lean | 5 - UnicodeBasic/Types.lean | 243 ---------------------------------- UnicodeData/Basic.lean | 28 ---- 4 files changed, 417 deletions(-) diff --git a/UnicodeBasic.lean b/UnicodeBasic.lean index c8eead4e6..aad7dd988 100644 --- a/UnicodeBasic.lean +++ b/UnicodeBasic.lean @@ -137,32 +137,11 @@ where .Sk, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ps, .Sm, .Po, .Sm, .Cc] -set_option linter.deprecated false in -@[deprecated Unicode.getGC (since := "1.2.0")] -def getGeneralCategory (char : Char) : GeneralCategory := - .ofGC! (getGC char) - instance : Membership Char GC where mem cat char := getGC char ⊆ cat instance (char : Char) (cat : GC) : Decidable (char ∈ cat) := inferInstanceAs (Decidable (_ ⊆ _)) -set_option linter.deprecated false in -@[deprecated "use `char ∈ category` instead" (since := "1.2.0")] -def isInGeneralCategory (char : Char) (category : GeneralCategory) : Bool := - match category, getGeneralCategory char with - | ⟨major, none⟩, ⟨charMajor, _⟩ => major == charMajor - | ⟨_, some .casedLetter⟩, ⟨_, some .lowercaseLetter⟩ => true - | ⟨_, some .casedLetter⟩, ⟨_, some .titlecaseLetter⟩ => true - | ⟨_, some .casedLetter⟩, ⟨_, some .uppercaseLetter⟩ => true - | ⟨_, some .casedLetter⟩, _ => false - | ⟨_, some .groupPunctuation⟩, ⟨_, some .openPunctuation⟩ => true - | ⟨_, some .groupPunctuation⟩, ⟨_, some .closePunctuation⟩ => true - | ⟨_, some .groupPunctuation⟩, _ => false - | ⟨_, some .quotePunctuation⟩, ⟨_, some .initialPunctuation⟩ => true - | ⟨_, some .quotePunctuation⟩, ⟨_, some .finalPunctuation⟩ => true - | ⟨_, some .quotePunctuation⟩, _ => false - | cat, charCat => cat == charCat namespace GeneralCategory @@ -173,33 +152,21 @@ namespace GeneralCategory Unicode Property: `General_Category=L` -/ abbrev isLetter (char : Char) : Bool := char ∈ Unicode.GC.L -@[deprecated "c ∈ Unicode.GC.L" (since := "1.2.0")] -protected abbrev isL := isLetter - /-- Check if lowercase letter character (`Ll`) Unicode Property: `General_Category=Ll` -/ abbrev isLowercaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Ll -@[deprecated "c ∈ Unicode.GC.Ll" (since := "1.2.0")] -protected abbrev isLl := isLowercaseLetter - /-- Check if titlecase letter character (`Lt`) Unicode Property: `General_Category=Lt` -/ abbrev isTitlecaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Lt -@[deprecated "c ∈ Unicode.GC.Lt" (since := "1.2.0")] -protected abbrev isLt := isTitlecaseLetter - /-- Check if uppercase letter character (`Lu`) Unicode Property: `General_Category=Lu` -/ abbrev isUppercaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Lu -@[deprecated "c ∈ Unicode.GC.Lu" (since := "1.2.0")] -protected abbrev isLu := isUppercaseLetter - /-- Check if cased letter character (`LC`) This is a derived category (`L = Lu | Ll | Lt`). @@ -207,25 +174,16 @@ protected abbrev isLu := isUppercaseLetter Unicode Property: `General_Category=LC` -/ abbrev isCasedLetter (char : Char) : Bool := char ∈ Unicode.GC.LC -@[deprecated "c ∈ Unicode.GC.LC" (since := "1.2.0")] -protected abbrev isLC := isCasedLetter - /-- Check if modifier letter character (`Lm`) Unicode Property: `General_Category=Lm`-/ abbrev isModifierLetter (char : Char) : Bool := char ∈ Unicode.GC.Lm -@[deprecated "c ∈ Unicode.GC.Lm" (since := "1.2.0")] -protected abbrev isLm := isModifierLetter - /-- Check if other letter character (`Lo`) Unicode Property: `General_Category=Lo`-/ abbrev isOtherLetter (char : Char) : Bool := char ∈ Unicode.GC.Lo -@[deprecated "c ∈ Unicode.GC.Lo" (since := "1.2.0")] -protected abbrev isLo := isOtherLetter - /-- Check if mark character (`M`) This is a derived category (`M = Mn | Mc | Me`). @@ -233,33 +191,21 @@ protected abbrev isLo := isOtherLetter Unicode Property: `General_Category=M` -/ abbrev isMark (char : Char) : Bool := char ∈ Unicode.GC.M -@[deprecated "c ∈ Unicode.GC.M" (since := "1.2.0")] -protected abbrev isM := isMark - /-- Check if nonspacing combining mark character (`Mn`) Unicode Property: `General_Category=Mn` -/ abbrev isNonspacingMark (char : Char) : Bool := char ∈ Unicode.GC.Mn -@[deprecated "c ∈ Unicode.GC.Mn" (since := "1.2.0")] -protected abbrev isMn := isNonspacingMark - /-- Check if spacing combining mark character (`Mc`) Unicode Property: `General_Category=Mc` -/ abbrev isSpacingMark (char : Char) : Bool := char ∈ Unicode.GC.Mc -@[deprecated "c ∈ Unicode.GC.Mc" (since := "1.2.0")] -protected abbrev isMc := isSpacingMark - /-- Check if enclosing combining mark character (`Me`) Unicode Property: `General_Category=Me` -/ abbrev isEnclosingMark (char : Char) : Bool := char ∈ Unicode.GC.Me -@[deprecated "c ∈ Unicode.GC.Me" (since := "1.2.0")] -protected abbrev isMe := isEnclosingMark - /-- Check if number character (`N`) This is a derived category (`N = Nd | Nl | No`). @@ -267,33 +213,21 @@ protected abbrev isMe := isEnclosingMark Unicode Property: `General_Category=N` -/ abbrev isNumber (char : Char) : Bool := char ∈ Unicode.GC.N -@[deprecated "c ∈ Unicode.GC.N" (since := "1.2.0")] -protected abbrev isN := isNumber - /-- Check if decimal number character (`Nd`) Unicode Property: `General_Category=Nd` -/ abbrev isDecimalNumber (char : Char) : Bool := char ∈ Unicode.GC.Nd -@[deprecated "c ∈ Unicode.GC.Nd" (since := "1.2.0")] -protected abbrev isNd := isDecimalNumber - /-- Check if letter number character (`Nl`) Unicode Property: `General_Category=Nl` -/ abbrev isLetterNumber (char : Char) : Bool := char ∈ Unicode.GC.Nl -@[deprecated "c ∈ Unicode.GC.Nl" (since := "1.2.0")] -protected abbrev isNl := isLetterNumber - /-- Check if other number character (`No`) Unicode Property: `General_Category=No` -/ abbrev isOtherNumber (char : Char) : Bool := char ∈ Unicode.GC.No -@[deprecated "c ∈ Unicode.GC.No" (since := "1.2.0")] -protected abbrev isNo := isOtherNumber - /-- Check if punctuation character (`P`) This is a derived category (`P = Pc | Pd | Ps | Pe | Pi | Pf | Po`). @@ -301,25 +235,16 @@ protected abbrev isNo := isOtherNumber Unicode Property: `General_Category=P` -/ abbrev isPunctuation (char : Char) : Bool := char ∈ Unicode.GC.P -@[deprecated "c ∈ Unicode.GC.P" (since := "1.2.0")] -protected abbrev isP := isPunctuation - /-- Check if connector punctuation character (`Pc`) Unicode Property: `General_Category=Pc` -/ abbrev isConnectorPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pc -@[deprecated "c ∈ Unicode.GC.Pc" (since := "1.2.0")] -protected abbrev isPc := isConnectorPunctuation - /-- Check if dash punctuation character (`Pd`) Unicode Property: `General_Category=Pd` -/ abbrev isDashPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pd -@[deprecated "c ∈ Unicode.GC.Pd" (since := "1.2.0")] -protected abbrev isPd := isDashPunctuation - /-- Check if grouping punctuation character (`PG`) This is a derived category (`PG = Ps | Pe`). @@ -327,25 +252,16 @@ protected abbrev isPd := isDashPunctuation Unicode Property: `General_Category=PG` -/ abbrev isGroupPunctuation (char : Char) : Bool := char ∈ Unicode.GC.PG -@[deprecated "c ∈ Unicode.GC.PG" (since := "1.2.0")] -protected abbrev isPG := isGroupPunctuation - /-- Check if open punctuation character (`Ps`) Unicode Property: `General_Category=Ps` -/ abbrev isOpenPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Ps -@[deprecated "c ∈ Unicode.GC.Ps" (since := "1.2.0")] -protected abbrev isPs := isOpenPunctuation - /-- Check if close punctuation character (`Pe`) Unicode Property: `General_Category=Pe` -/ abbrev isClosePunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pe -@[deprecated "c ∈ Unicode.GC.Pe" (since := "1.2.0")] -protected abbrev isPe := isClosePunctuation - /-- Check if quoting punctuation character (`PQ`) This is a derived category (`PQ = Pi | Pf`). @@ -353,33 +269,21 @@ protected abbrev isPe := isClosePunctuation Unicode Property: `General_Category=PQ` -/ abbrev isQuotePunctuation (char : Char) : Bool := char ∈ Unicode.GC.PQ -@[deprecated "c ∈ Unicode.GC.PQ" (since := "1.2.0")] -protected abbrev isPQ := isQuotePunctuation - /-- Check if initial punctuation character (`Pi`) Unicode Property: `General_Category=Pi` -/ abbrev isInitialPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pi -@[deprecated "c ∈ Unicode.GC.Pi" (since := "1.2.0")] -protected abbrev isPi := isInitialPunctuation - /-- Check if initial punctuation character (`Pf`) Unicode Property: `General_Category=Pf` -/ abbrev isFinalPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pf -@[deprecated "c ∈ Unicode.GC.Pf" (since := "1.2.0")] -protected abbrev isPf := isFinalPunctuation - /-- Check if other punctuation character (`Po`) Unicode Property: `General_Category=Po` -/ abbrev isOtherPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Po -@[deprecated "c ∈ Unicode.GC.Po" (since := "1.2.0")] -protected abbrev isPo := isOtherPunctuation - /-- Check if symbol character (`S`) This is a derived category (`S = Sm | Sc | Sk | So`). @@ -387,41 +291,26 @@ protected abbrev isPo := isOtherPunctuation Unicode Property: `General_Category=S` -/ abbrev isSymbol (char : Char) : Bool := char ∈ Unicode.GC.S -@[deprecated "c ∈ Unicode.GC.S" (since := "1.2.0")] -protected abbrev isS := isSymbol - /-- Check if math symbol character (`Sm`) Unicode Property: `General_Category=Sm` -/ abbrev isMathSymbol (char : Char) : Bool := char ∈ Unicode.GC.Sm -@[deprecated "c ∈ Unicode.GC.Sm" (since := "1.2.0")] -protected abbrev isSm := isMathSymbol - /-- Check if currency symbol character (`Sc`) Unicode Property: `General_Category=Sc` -/ abbrev isCurrencySymbol (char : Char) : Bool := char ∈ Unicode.GC.Sc -@[deprecated "c ∈ Unicode.GC.Sc" (since := "1.2.0")] -protected abbrev isSc := isCurrencySymbol - /-- Check if modifier symbol character (`Sk`) Unicode Property: `General_Category=Sk` -/ abbrev isModifierSymbol (char : Char) : Bool := char ∈ Unicode.GC.Sk -@[deprecated "c ∈ Unicode.GC.Sk" (since := "1.2.0")] -protected abbrev isSk := isModifierSymbol - /-- Check if other symbol character (`So`) Unicode Property: `General_Category=So` -/ abbrev isOtherSymbol (char : Char) : Bool := char ∈ Unicode.GC.So -@[deprecated "c ∈ Unicode.GC.So" (since := "1.2.0")] -protected abbrev isSo := isOtherSymbol - /-- Check if separator character (`Z`) This is a derived property (`Z = Zs | Zl | Zp`). @@ -429,33 +318,21 @@ protected abbrev isSo := isOtherSymbol Unicode Property: `General_Category=Z` -/ abbrev isSeparator (char : Char) : Bool := char ∈ Unicode.GC.Z -@[deprecated "c ∈ Unicode.GC.Z" (since := "1.2.0")] -protected abbrev isZ := isSeparator - /-- Check if space separator character (`Zs`) Unicode Property: `General_Category=Zs` -/ abbrev isSpaceSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zs -@[deprecated "c ∈ Unicode.GC.Zs" (since := "1.2.0")] -protected abbrev isZs := isSpaceSeparator - /-- Check if line separator character (`Zl`) Unicode Property: `General_Category=Zl` -/ abbrev isLineSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zl -@[deprecated "c ∈ Unicode.GC.Zl" (since := "1.2.0")] -protected abbrev isZl := isLineSeparator - /-- Check if paragraph separator character (`Zp`) Unicode Property: `General_Category=Zp` -/ abbrev isParagraphSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zp -@[deprecated "c ∈ Unicode.GC.Zp" (since := "1.2.0")] -protected abbrev isZp := isParagraphSeparator - /-- Check if other character (`C`) This is a derived category (`C = Cc | Cf | Cs | Co | Cn`). @@ -463,25 +340,16 @@ protected abbrev isZp := isParagraphSeparator Unicode Property: `General_Category=C` -/ abbrev isOther (char : Char) : Bool := char ∈ Unicode.GC.C -@[deprecated "c ∈ Unicode.GC.C" (since := "1.2.0")] -protected abbrev isC := isOther - /-- Check if control character (`Cc`) Unicode Property: `General_Category=Cc` -/ abbrev isControl (char : Char) : Bool := char ∈ Unicode.GC.Cc -@[deprecated "c ∈ Unicode.GC.Cc" (since := "1.2.0")] -protected abbrev isCc := isControl - /-- Check if format character (`Cf`) Unicode Property: `General_Category=Cf` -/ abbrev isFormat (char : Char) : Bool := char ∈ Unicode.GC.Cf -@[deprecated "c ∈ Unicode.GC.Cf" (since := "1.2.0")] -protected abbrev isCf := isFormat - /-- Check if surrogate character (`Cs`) Does not actually occur since Lean does not regard surrogate code points as characters. @@ -489,25 +357,16 @@ protected abbrev isCf := isFormat Unicode Property: `General_Category=Cs` -/ abbrev isSurrogate (char : Char) : Bool := char ∈ Unicode.GC.Cs -@[deprecated "c ∈ Unicode.GC.Cs" (since := "1.2.0")] -protected abbrev isCs := isSurrogate - /-- Check if private use character (`Co`) Unicode Property: `General_Category=Co` -/ abbrev isPrivateUse (char : Char) : Bool := char ∈ Unicode.GC.Co -@[deprecated "c ∈ Unicode.GC.Co" (since := "1.2.0")] -protected abbrev isCo := isPrivateUse - /-- Check if unassigned character (`Cn`) Unicode Property: `General_Category=Cn` -/ abbrev isUnassigned (char : Char) : Bool := char ∈ Unicode.GC.Cn -@[deprecated "c ∈ Unicode.GC.Cn" (since := "1.2.0")] -protected abbrev isCn := isUnassigned - end GeneralCategory /-! diff --git a/UnicodeBasic/TableLookup.lean b/UnicodeBasic/TableLookup.lean index 0251d7cde..f93606630 100644 --- a/UnicodeBasic/TableLookup.lean +++ b/UnicodeBasic/TableLookup.lean @@ -185,11 +185,6 @@ where @[inline] def lookupGC (c : UInt32) : GC := CLib.lookupProp c |>.toUInt32 -set_option linter.deprecated false in -@[inline, deprecated Unicode.lookupGC (since := "v1.3.0")] -def lookupGeneralCategory (c : UInt32) : GeneralCategory := - .ofGC! (lookupGC c) - /-- Get name of a code point using lookup table Unicode property: `Name` -/ diff --git a/UnicodeBasic/Types.lean b/UnicodeBasic/Types.lean index e665c4a60..8c179d658 100644 --- a/UnicodeBasic/Types.lean +++ b/UnicodeBasic/Types.lean @@ -486,249 +486,6 @@ def ofString! (s : String.Slice) : GC := end GC -set_option linter.deprecated false in -@[deprecated Unicode.GC (since := "1.2.0")] -structure GeneralCategory : Type where - /-- Major general category of a code point -/ - major : MajorGeneralCategory - /-- Minor general category of a code point -/ - minor : Option (MinorGeneralCategory major) -deriving Inhabited, DecidableEq - -set_option linter.deprecated false in section - -/-- General category: letter (`L`) -/ -@[deprecated Unicode.GC.L (since := "1.2.0")] -protected def GeneralCategory.L : GeneralCategory := ⟨.letter, none⟩ -/-- General category: cased letter (`LC`) -/ -@[deprecated Unicode.GC.LC (since := "1.2.0")] -protected def GeneralCategory.LC : GeneralCategory := ⟨_, some .casedLetter⟩ -/-- General category: uppercase letter (`Lu`) -/ -@[deprecated Unicode.GC.Lu (since := "1.2.0")] -protected def GeneralCategory.Lu : GeneralCategory := ⟨_, some .uppercaseLetter⟩ -/-- General category: lowercase letter (`Ll`) -/ -@[deprecated Unicode.GC.Ll (since := "1.2.0")] -protected def GeneralCategory.Ll : GeneralCategory := ⟨_, some .lowercaseLetter⟩ -/-- General category: titlecase letter (`Lt`) -/ -@[deprecated Unicode.GC.Lt (since := "1.2.0")] -protected def GeneralCategory.Lt : GeneralCategory := ⟨_, some .titlecaseLetter⟩ -/-- General category: modifier letter (`Lm`) -/ -@[deprecated Unicode.GC.Lm (since := "1.2.0")] -protected def GeneralCategory.Lm : GeneralCategory := ⟨_, some .modifierLetter⟩ -/-- General category: other letter (`Lo`) -/ -@[deprecated Unicode.GC.Lo (since := "1.2.0")] -protected def GeneralCategory.Lo : GeneralCategory := ⟨_, some .otherLetter⟩ -/-- General category mark (`M`) -/ -@[deprecated Unicode.GC.M (since := "1.2.0")] -protected def GeneralCategory.M : GeneralCategory := ⟨.mark, none⟩ -/-- General category: nonspacing combining mark (`Mn`) -/ -@[deprecated Unicode.GC.Mn (since := "1.2.0")] -protected def GeneralCategory.Mn : GeneralCategory := ⟨_, some .nonspacingMark⟩ -/-- General category: spacing combining mark (`Mc`) -/ -@[deprecated Unicode.GC.Mc (since := "1.2.0")] -protected def GeneralCategory.Mc : GeneralCategory := ⟨_, some .spacingMark⟩ -/-- General category: enclosing combining mark (`Me`) -/ -@[deprecated Unicode.GC.Me (since := "1.2.0")] -protected def GeneralCategory.Me : GeneralCategory := ⟨_, some .enclosingMark⟩ -/-- General category: number (`N`) -/ -@[deprecated Unicode.GC.N (since := "1.2.0")] -protected def GeneralCategory.N : GeneralCategory := ⟨.number, none⟩ -/-- General category: decimal digit (`Nd`) -/ -@[deprecated Unicode.GC.Nd (since := "1.2.0")] -protected def GeneralCategory.Nd : GeneralCategory := ⟨_, some .decimalNumber⟩ -/-- General category: letter number (`Nl`) -/ -@[deprecated Unicode.GC.Nl (since := "1.2.0")] -protected def GeneralCategory.Nl : GeneralCategory := ⟨_, some .letterNumber⟩ -/-- General category: other number (`No`) -/ -@[deprecated Unicode.GC.No (since := "1.2.0")] -protected def GeneralCategory.No : GeneralCategory := ⟨_, some .otherNumber⟩ -/-- General category: punctuation (`P`) -/ -@[deprecated Unicode.GC.P (since := "1.2.0")] -protected def GeneralCategory.P : GeneralCategory := ⟨.punctuation, none⟩ -/-- General category: connector punctuation (`Pc`) -/ -@[deprecated Unicode.GC.Pc (since := "1.2.0")] -protected def GeneralCategory.Pc : GeneralCategory := ⟨_, some .connectorPunctuation⟩ -/-- General category: dash punctuation (`Pd`) -/ -@[deprecated Unicode.GC.Pd (since := "1.2.0")] -protected def GeneralCategory.Pd : GeneralCategory := ⟨_, some .dashPunctuation⟩ -/-- General category: grouping punctuation (`PG`) -/ -@[deprecated Unicode.GC.PG (since := "1.2.0")] -protected def GeneralCategory.PG : GeneralCategory := ⟨_, some .groupPunctuation⟩ -/-- General category: opening punctuation (`Ps`) -/ -@[deprecated Unicode.GC.Ps (since := "1.2.0")] -protected def GeneralCategory.Ps : GeneralCategory := ⟨_, some .openPunctuation⟩ -/-- General category: closing punctuation (`Pe`) -/ -@[deprecated Unicode.GC.Pe (since := "1.2.0")] -protected def GeneralCategory.Pe : GeneralCategory := ⟨_, some .closePunctuation⟩ -/-- General category: quoting punctuation (`PQ`) -/ -@[deprecated Unicode.GC.PQ (since := "1.2.0")] -protected def GeneralCategory.PQ : GeneralCategory := ⟨_, some .quotePunctuation⟩ -/-- General category: initial punctuation (`Pi`) -/ -@[deprecated Unicode.GC.Pi (since := "1.2.0")] -protected def GeneralCategory.Pi : GeneralCategory := ⟨_, some .initialPunctuation⟩ -/-- General category: final punctuation (`Pf`) -/ -@[deprecated Unicode.GC.Pf (since := "1.2.0")] -protected def GeneralCategory.Pf : GeneralCategory := ⟨_, some .finalPunctuation⟩ -/-- General category: other punctuation (`Po`) -/ -@[deprecated Unicode.GC.Po (since := "1.2.0")] -protected def GeneralCategory.Po : GeneralCategory := ⟨_, some .otherPunctuation⟩ -/-- General category: symbol (`S`) -/ -@[deprecated Unicode.GC.S (since := "1.2.0")] -protected def GeneralCategory.S : GeneralCategory := ⟨.symbol, none⟩ -/-- General category: math symbol (`Sm`) -/ -@[deprecated Unicode.GC.Sm (since := "1.2.0")] -protected def GeneralCategory.Sm : GeneralCategory := ⟨_, some .mathSymbol⟩ -/-- General category: currency symbol (`Sc`) -/ -@[deprecated Unicode.GC.Sc (since := "1.2.0")] -protected def GeneralCategory.Sc : GeneralCategory := ⟨_, some .currencySymbol⟩ -/-- General category: modifier symbol (`Sk`) -/ -@[deprecated Unicode.GC.Sk (since := "1.2.0")] -protected def GeneralCategory.Sk : GeneralCategory := ⟨_, some .modifierSymbol⟩ -/-- General category: other symbol (`So`) -/ -@[deprecated Unicode.GC.So (since := "1.2.0")] -protected def GeneralCategory.So : GeneralCategory := ⟨_, some .otherSymbol⟩ -/-- General category: separator (`Z`) -/ -@[deprecated Unicode.GC.Z (since := "1.2.0")] -protected def GeneralCategory.Z : GeneralCategory := ⟨.separator, none⟩ -/-- General category: space separator (`Zs`) -/ -@[deprecated Unicode.GC.Zs (since := "1.2.0")] -protected def GeneralCategory.Zs : GeneralCategory := ⟨_, some .spaceSeparator⟩ -/-- General category: line separator (`Zl`) -/ -@[deprecated Unicode.GC.Zl (since := "1.2.0")] -protected def GeneralCategory.Zl : GeneralCategory := ⟨_, some .lineSeparator⟩ -/-- General category: paragraph separator (`Zp`) -/ -@[deprecated Unicode.GC.Zp (since := "1.2.0")] -protected def GeneralCategory.Zp : GeneralCategory := ⟨_, some .paragraphSeparator⟩ -/-- General category: other (`C`) -/ -@[deprecated Unicode.GC.C (since := "1.2.0")] -protected def GeneralCategory.C : GeneralCategory := ⟨.other, none⟩ -/-- General category: control (`Cc`) -/ -@[deprecated Unicode.GC.Cc (since := "1.2.0")] -protected def GeneralCategory.Cc : GeneralCategory := ⟨_, some .control⟩ -/-- General category: format (`Cf`) -/ -@[deprecated Unicode.GC.Cf (since := "1.2.0")] -protected def GeneralCategory.Cf : GeneralCategory := ⟨_, some .format⟩ -/-- General category: surrogate (`Cs`) -/ -@[deprecated Unicode.GC.Cs (since := "1.2.0")] -protected def GeneralCategory.Cs : GeneralCategory := ⟨_, some .surrogate⟩ -/-- General category: private use (`Co`) -/ -@[deprecated Unicode.GC.Co (since := "1.2.0")] -protected def GeneralCategory.Co : GeneralCategory := ⟨_, some .privateUse⟩ -/-- General category: unassigned (`Cn`) -/ -@[deprecated Unicode.GC.Cn (since := "1.2.0")] -protected def GeneralCategory.Cn : GeneralCategory := ⟨_, some .unassigned⟩ - -def GeneralCategory.toGC : GeneralCategory → GC -| ⟨.letter, none⟩ => .L -| ⟨_, some .casedLetter⟩ => .LC -| ⟨_, some .uppercaseLetter⟩ => .Lu -| ⟨_, some .lowercaseLetter⟩ => .Ll -| ⟨_, some .titlecaseLetter⟩ => .Lt -| ⟨_, some .modifierLetter⟩ => .Lm -| ⟨_, some .otherLetter⟩ => .Lo -| ⟨.mark, none⟩ => .M -| ⟨_, some .nonspacingMark⟩ => .Mn -| ⟨_, some .spacingMark⟩ => .Mc -| ⟨_, some .enclosingMark⟩ => .Me -| ⟨.number, none⟩ => .N -| ⟨_, some .decimalNumber⟩ => .Nd -| ⟨_, some .letterNumber⟩ => .Nl -| ⟨_, some .otherNumber⟩ => .No -| ⟨.punctuation, none⟩ => .P -| ⟨_, some .connectorPunctuation⟩ => .Pc -| ⟨_, some .dashPunctuation⟩ => .Pd -| ⟨_, some .groupPunctuation⟩ => .PG -| ⟨_, some .openPunctuation⟩ => .Ps -| ⟨_, some .closePunctuation⟩ => .Pe -| ⟨_, some .quotePunctuation⟩ => .PQ -| ⟨_, some .initialPunctuation⟩ => .Pi -| ⟨_, some .finalPunctuation⟩ => .Pf -| ⟨_, some .otherPunctuation⟩ => .Po -| ⟨.symbol, none⟩ => .S -| ⟨_, some .mathSymbol⟩ => .Sm -| ⟨_, some .currencySymbol⟩ => .Sc -| ⟨_, some .modifierSymbol⟩ => .Sk -| ⟨_, some .otherSymbol⟩ => .So -| ⟨.separator, none⟩ => .Z -| ⟨_, some .spaceSeparator⟩ => .Zs -| ⟨_, some .lineSeparator⟩ => .Zl -| ⟨_, some .paragraphSeparator⟩ => .Zp -| ⟨.other, none⟩ => .C -| ⟨_, some .control⟩ => .Cc -| ⟨_, some .format⟩ => .Cf -| ⟨_, some .surrogate⟩ => .Cs -| ⟨_, some .privateUse⟩ => .Co -| ⟨_, some .unassigned⟩ => .Cn - -@[deprecated some (since := "1.2.0")] -def GeneralCategory.ofGC? (c : GC) : Option GeneralCategory := - if c == .C then some .C else - if c == .Cc then some .Cc else - if c == .Cf then some .Cf else - if c == .Cs then some .Cs else - if c == .Co then some .Co else - if c == .Cn then some .Cn else - if c == .L then some .L else - if c == .LC then some .LC else - if c == .Lu then some .Lu else - if c == .Ll then some .Ll else - if c == .Lt then some .Lt else - if c == .Lm then some .Lm else - if c == .Lo then some .Lo else - if c == .M then some .M else - if c == .Mn then some .Mn else - if c == .Mc then some .Mc else - if c == .Me then some .Me else - if c == .N then some .N else - if c == .Nd then some .Nd else - if c == .Nl then some .Nl else - if c == .No then some .No else - if c == .P then some .P else - if c == .PG then some .PG else - if c == .PQ then some .PQ else - if c == .Pc then some .Pc else - if c == .Pd then some .Pd else - if c == .Ps then some .Ps else - if c == .Pe then some .Pe else - if c == .Pi then some .Pi else - if c == .Pf then some .Pf else - if c == .Po then some .Po else - if c == .S then some .S else - if c == .Sm then some .Sm else - if c == .Sc then some .Sc else - if c == .Sk then some .Sk else - if c == .So then some .So else - if c == .Z then some .Z else - if c == .Zs then some .Zs else - if c == .Zl then some .Zl else - if c == .Zp then some .Zp else - none - -@[deprecated id (since := "1.2.0")] -def GeneralCategory.ofGC! (c : GC) : GeneralCategory := - (ofGC? c).get! - -/-- String abbreviation for general category -/ -@[deprecated Unicode.GC.toAbbrev! (since := "1.2.0")] -def GeneralCategory.toAbbrev (c : GeneralCategory) : String := - c.toGC.toAbbrev! - -/-- Get general category from string abbreviation -/ -@[deprecated Unicode.GC.ofAbbrev? (since := "1.2.0")] -def GeneralCategory.ofAbbrev? (s : String.Slice) : Option GeneralCategory := - GC.ofAbbrev? s >>= ofGC? - -@[deprecated Unicode.GC.ofAbbrev! (since := "1.2.0"), inherit_doc GeneralCategory.ofAbbrev?] -def GeneralCategory.ofAbbrev! (s : String.Slice) : GeneralCategory := - match ofAbbrev? s with - | some gc => gc - | none => panic! "invalid general category abbreviation" - -instance : Repr GeneralCategory where - reprPrec gc _ := s!"Unicode.GeneralCategory.{gc.toAbbrev}" - -end - /-! ## Numeric Type and Value ## -/ diff --git a/UnicodeData/Basic.lean b/UnicodeData/Basic.lean index 9d443c4b3..63cfdf0cf 100644 --- a/UnicodeData/Basic.lean +++ b/UnicodeData/Basic.lean @@ -37,34 +37,6 @@ structure UnicodeData where titlecase : Option Char := none deriving BEq -@[deprecated UnicodeData.code (since := "1.2.0")] -abbrev UnicodeData.codeValue := @UnicodeData.code - -@[deprecated UnicodeData.name (since := "1.2.0")] -abbrev UnicodeData.characterName := @UnicodeData.name - -set_option linter.deprecated false in -@[deprecated UnicodeData.gc (since := "1.2.0")] -def UnicodeData.generalCategory (d : UnicodeData) : GeneralCategory := .ofGC! d.gc - -@[deprecated UnicodeData.bidi (since := "1.2.0")] -abbrev UnicodeData.bidiClass := @UnicodeData.bidi - -@[deprecated UnicodeData.cc (since := "1.2.0")] -abbrev UnicodeData.canonicalCombiningClass := @UnicodeData.cc - -@[deprecated UnicodeData.cc (since := "1.2.0")] -abbrev UnicodeData.decompositionMapping := @UnicodeData.decomp - -@[deprecated UnicodeData.lowercase (since := "1.2.0")] -abbrev UnicodeData.lowercaseMapping := @UnicodeData.lowercase - -@[deprecated UnicodeData.uppercase (since := "1.2.0")] -abbrev UnicodeData.uppercaseMapping := @UnicodeData.uppercase - -@[deprecated UnicodeData.titlecase (since := "1.2.0")] -abbrev UnicodeData.titlecaseMapping := @UnicodeData.titlecase - instance : Inhabited UnicodeData where default := { code := 0 From 6bcb6addbf4b4f234c9e0b2a8f1f0a49fe690fad Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 03:58:51 -0400 Subject: [PATCH 02/10] chore: cleanup UnicodeBasic --- UnicodeBasic.lean | 147 ++++----- UnicodeBasic/CharacterDatabase.lean | 18 +- UnicodeBasic/Hangul.lean | 60 ++-- UnicodeBasic/TableLookup.lean | 64 ++-- UnicodeBasic/Types.lean | 488 ++++++++++------------------ 5 files changed, 314 insertions(+), 463 deletions(-) diff --git a/UnicodeBasic.lean b/UnicodeBasic.lean index aad7dd988..415abfe03 100644 --- a/UnicodeBasic.lean +++ b/UnicodeBasic.lean @@ -6,8 +6,6 @@ module public import UnicodeBasic.Types public import UnicodeBasic.TableLookup -public section - /-! # General API # @@ -58,7 +56,7 @@ namespace Unicode Unicode property: `Name` -/ @[inline] -def getName (char : Char) : String := lookupName char.val +public def getName (char : Char) : String := lookupName char.val /-! ## Script ## @@ -69,7 +67,7 @@ def getName (char : Char) : String := lookupName char.val Unicode property: `Script` -/ @[inline] -def getScript (char : Char) : Script := lookupScript char.val +public def getScript (char : Char) : Script := lookupScript char.val /-- Get script name @@ -78,7 +76,7 @@ def getScript (char : Char) : Script := lookupScript char.val Unicode property: `Script` -/ @[inline] -def getScriptName? (s : Script) : Option String := +public def getScriptName? (s : Script) : Option String := lookupScriptName s |>.map toString /-! @@ -89,19 +87,19 @@ def getScriptName? (s : Script) : Option String := Unicode property: `Bidi_Class` -/ @[inline] -def getBidiClass (char : Char) : BidiClass := lookupBidiClass char.val +public def getBidiClass (char : Char) : BidiClass := lookupBidiClass char.val /-- Check if bidirectional mirrored character Unicode property: `Bidi_Mirrored` -/ @[inline] -def isBidiMirrored (char : Char) : Bool := lookupBidiMirrored char.val +public def isBidiMirrored (char : Char) : Bool := lookupBidiMirrored char.val /-- Check if bidirectional control character Unicode property: `Bidi_Control` -/ @[inline] -def isBidiControl (char : Char) : Bool := +public def isBidiControl (char : Char) : Bool := -- Extracted from `PropList.txt` char.val == 0x061C || char.val <= 0x200F && char.val >= 0x200E @@ -120,7 +118,7 @@ def isBidiControl (char : Char) : Bool := Unicode property: `General_Category` -/ @[inline] -def getGC (char : Char) : GC := +public def getGC (char : Char) : GC := -- ASCII shortcut if h : char.toNat < table.size then table[char.toNat] @@ -137,11 +135,10 @@ where .Sk, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ll, .Ps, .Sm, .Po, .Sm, .Cc] -instance : Membership Char GC where +public instance : Membership Char GC where mem cat char := getGC char ⊆ cat -instance (char : Char) (cat : GC) : Decidable (char ∈ cat) := inferInstanceAs (Decidable (_ ⊆ _)) - +public instance (char : Char) (cat : GC) : Decidable (char ∈ cat) := inferInstanceAs (Decidable (_ ⊆ _)) namespace GeneralCategory @@ -150,222 +147,222 @@ namespace GeneralCategory This is a derived category (`L = Lu | Ll | Lt | Lm | Lo`). Unicode Property: `General_Category=L` -/ -abbrev isLetter (char : Char) : Bool := char ∈ Unicode.GC.L +public abbrev isLetter (char : Char) : Bool := char ∈ Unicode.GC.L /-- Check if lowercase letter character (`Ll`) Unicode Property: `General_Category=Ll` -/ -abbrev isLowercaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Ll +public abbrev isLowercaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Ll /-- Check if titlecase letter character (`Lt`) Unicode Property: `General_Category=Lt` -/ -abbrev isTitlecaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Lt +public abbrev isTitlecaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Lt /-- Check if uppercase letter character (`Lu`) Unicode Property: `General_Category=Lu` -/ -abbrev isUppercaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Lu +public abbrev isUppercaseLetter (char : Char) : Bool := char ∈ Unicode.GC.Lu /-- Check if cased letter character (`LC`) This is a derived category (`L = Lu | Ll | Lt`). Unicode Property: `General_Category=LC` -/ -abbrev isCasedLetter (char : Char) : Bool := char ∈ Unicode.GC.LC +public abbrev isCasedLetter (char : Char) : Bool := char ∈ Unicode.GC.LC /-- Check if modifier letter character (`Lm`) Unicode Property: `General_Category=Lm`-/ -abbrev isModifierLetter (char : Char) : Bool := char ∈ Unicode.GC.Lm +public abbrev isModifierLetter (char : Char) : Bool := char ∈ Unicode.GC.Lm /-- Check if other letter character (`Lo`) Unicode Property: `General_Category=Lo`-/ -abbrev isOtherLetter (char : Char) : Bool := char ∈ Unicode.GC.Lo +public abbrev isOtherLetter (char : Char) : Bool := char ∈ Unicode.GC.Lo /-- Check if mark character (`M`) This is a derived category (`M = Mn | Mc | Me`). Unicode Property: `General_Category=M` -/ -abbrev isMark (char : Char) : Bool := char ∈ Unicode.GC.M +public abbrev isMark (char : Char) : Bool := char ∈ Unicode.GC.M /-- Check if nonspacing combining mark character (`Mn`) Unicode Property: `General_Category=Mn` -/ -abbrev isNonspacingMark (char : Char) : Bool := char ∈ Unicode.GC.Mn +public abbrev isNonspacingMark (char : Char) : Bool := char ∈ Unicode.GC.Mn /-- Check if spacing combining mark character (`Mc`) Unicode Property: `General_Category=Mc` -/ -abbrev isSpacingMark (char : Char) : Bool := char ∈ Unicode.GC.Mc +public abbrev isSpacingMark (char : Char) : Bool := char ∈ Unicode.GC.Mc /-- Check if enclosing combining mark character (`Me`) Unicode Property: `General_Category=Me` -/ -abbrev isEnclosingMark (char : Char) : Bool := char ∈ Unicode.GC.Me +public abbrev isEnclosingMark (char : Char) : Bool := char ∈ Unicode.GC.Me /-- Check if number character (`N`) This is a derived category (`N = Nd | Nl | No`). Unicode Property: `General_Category=N` -/ -abbrev isNumber (char : Char) : Bool := char ∈ Unicode.GC.N +public abbrev isNumber (char : Char) : Bool := char ∈ Unicode.GC.N /-- Check if decimal number character (`Nd`) Unicode Property: `General_Category=Nd` -/ -abbrev isDecimalNumber (char : Char) : Bool := char ∈ Unicode.GC.Nd +public abbrev isDecimalNumber (char : Char) : Bool := char ∈ Unicode.GC.Nd /-- Check if letter number character (`Nl`) Unicode Property: `General_Category=Nl` -/ -abbrev isLetterNumber (char : Char) : Bool := char ∈ Unicode.GC.Nl +public abbrev isLetterNumber (char : Char) : Bool := char ∈ Unicode.GC.Nl /-- Check if other number character (`No`) Unicode Property: `General_Category=No` -/ -abbrev isOtherNumber (char : Char) : Bool := char ∈ Unicode.GC.No +public abbrev isOtherNumber (char : Char) : Bool := char ∈ Unicode.GC.No /-- Check if punctuation character (`P`) This is a derived category (`P = Pc | Pd | Ps | Pe | Pi | Pf | Po`). Unicode Property: `General_Category=P` -/ -abbrev isPunctuation (char : Char) : Bool := char ∈ Unicode.GC.P +public abbrev isPunctuation (char : Char) : Bool := char ∈ Unicode.GC.P /-- Check if connector punctuation character (`Pc`) Unicode Property: `General_Category=Pc` -/ -abbrev isConnectorPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pc +public abbrev isConnectorPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pc /-- Check if dash punctuation character (`Pd`) Unicode Property: `General_Category=Pd` -/ -abbrev isDashPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pd +public abbrev isDashPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pd /-- Check if grouping punctuation character (`PG`) This is a derived category (`PG = Ps | Pe`). Unicode Property: `General_Category=PG` -/ -abbrev isGroupPunctuation (char : Char) : Bool := char ∈ Unicode.GC.PG +public abbrev isGroupPunctuation (char : Char) : Bool := char ∈ Unicode.GC.PG /-- Check if open punctuation character (`Ps`) Unicode Property: `General_Category=Ps` -/ -abbrev isOpenPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Ps +public abbrev isOpenPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Ps /-- Check if close punctuation character (`Pe`) Unicode Property: `General_Category=Pe` -/ -abbrev isClosePunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pe +public abbrev isClosePunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pe /-- Check if quoting punctuation character (`PQ`) This is a derived category (`PQ = Pi | Pf`). Unicode Property: `General_Category=PQ` -/ -abbrev isQuotePunctuation (char : Char) : Bool := char ∈ Unicode.GC.PQ +public abbrev isQuotePunctuation (char : Char) : Bool := char ∈ Unicode.GC.PQ /-- Check if initial punctuation character (`Pi`) Unicode Property: `General_Category=Pi` -/ -abbrev isInitialPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pi +public abbrev isInitialPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pi /-- Check if initial punctuation character (`Pf`) Unicode Property: `General_Category=Pf` -/ -abbrev isFinalPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pf +public abbrev isFinalPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Pf /-- Check if other punctuation character (`Po`) Unicode Property: `General_Category=Po` -/ -abbrev isOtherPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Po +public abbrev isOtherPunctuation (char : Char) : Bool := char ∈ Unicode.GC.Po /-- Check if symbol character (`S`) This is a derived category (`S = Sm | Sc | Sk | So`). Unicode Property: `General_Category=S` -/ -abbrev isSymbol (char : Char) : Bool := char ∈ Unicode.GC.S +public abbrev isSymbol (char : Char) : Bool := char ∈ Unicode.GC.S /-- Check if math symbol character (`Sm`) Unicode Property: `General_Category=Sm` -/ -abbrev isMathSymbol (char : Char) : Bool := char ∈ Unicode.GC.Sm +public abbrev isMathSymbol (char : Char) : Bool := char ∈ Unicode.GC.Sm /-- Check if currency symbol character (`Sc`) Unicode Property: `General_Category=Sc` -/ -abbrev isCurrencySymbol (char : Char) : Bool := char ∈ Unicode.GC.Sc +public abbrev isCurrencySymbol (char : Char) : Bool := char ∈ Unicode.GC.Sc /-- Check if modifier symbol character (`Sk`) Unicode Property: `General_Category=Sk` -/ -abbrev isModifierSymbol (char : Char) : Bool := char ∈ Unicode.GC.Sk +public abbrev isModifierSymbol (char : Char) : Bool := char ∈ Unicode.GC.Sk /-- Check if other symbol character (`So`) Unicode Property: `General_Category=So` -/ -abbrev isOtherSymbol (char : Char) : Bool := char ∈ Unicode.GC.So +public abbrev isOtherSymbol (char : Char) : Bool := char ∈ Unicode.GC.So /-- Check if separator character (`Z`) This is a derived property (`Z = Zs | Zl | Zp`). Unicode Property: `General_Category=Z` -/ -abbrev isSeparator (char : Char) : Bool := char ∈ Unicode.GC.Z +public abbrev isSeparator (char : Char) : Bool := char ∈ Unicode.GC.Z /-- Check if space separator character (`Zs`) Unicode Property: `General_Category=Zs` -/ -abbrev isSpaceSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zs +public abbrev isSpaceSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zs /-- Check if line separator character (`Zl`) Unicode Property: `General_Category=Zl` -/ -abbrev isLineSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zl +public abbrev isLineSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zl /-- Check if paragraph separator character (`Zp`) Unicode Property: `General_Category=Zp` -/ -abbrev isParagraphSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zp +public abbrev isParagraphSeparator (char : Char) : Bool := char ∈ Unicode.GC.Zp /-- Check if other character (`C`) This is a derived category (`C = Cc | Cf | Cs | Co | Cn`). Unicode Property: `General_Category=C` -/ -abbrev isOther (char : Char) : Bool := char ∈ Unicode.GC.C +public abbrev isOther (char : Char) : Bool := char ∈ Unicode.GC.C /-- Check if control character (`Cc`) Unicode Property: `General_Category=Cc` -/ -abbrev isControl (char : Char) : Bool := char ∈ Unicode.GC.Cc +public abbrev isControl (char : Char) : Bool := char ∈ Unicode.GC.Cc /-- Check if format character (`Cf`) Unicode Property: `General_Category=Cf` -/ -abbrev isFormat (char : Char) : Bool := char ∈ Unicode.GC.Cf +public abbrev isFormat (char : Char) : Bool := char ∈ Unicode.GC.Cf /-- Check if surrogate character (`Cs`) Does not actually occur since Lean does not regard surrogate code points as characters. Unicode Property: `General_Category=Cs` -/ -abbrev isSurrogate (char : Char) : Bool := char ∈ Unicode.GC.Cs +public abbrev isSurrogate (char : Char) : Bool := char ∈ Unicode.GC.Cs /-- Check if private use character (`Co`) Unicode Property: `General_Category=Co` -/ -abbrev isPrivateUse (char : Char) : Bool := char ∈ Unicode.GC.Co +public abbrev isPrivateUse (char : Char) : Bool := char ∈ Unicode.GC.Co /-- Check if unassigned character (`Cn`) Unicode Property: `General_Category=Cn` -/ -abbrev isUnassigned (char : Char) : Bool := char ∈ Unicode.GC.Cn +public abbrev isUnassigned (char : Char) : Bool := char ∈ Unicode.GC.Cn end GeneralCategory @@ -379,7 +376,7 @@ end GeneralCategory Unicode property: `Lowercase` -/ @[inline] -def isLowercase (char : Char) : Bool := +public def isLowercase (char : Char) : Bool := -- ASCII shortcut if char.val < 0x80 then 'a' ≤ char && char ≤ 'z' @@ -392,7 +389,7 @@ def isLowercase (char : Char) : Bool := Unicode property: `Uppercase` -/ @[inline] -def isUppercase (char : Char) : Bool := +public def isUppercase (char : Char) : Bool := -- ASCII shortcut if char.val < 0x80 then 'A' ≤ char && char ≤ 'Z' @@ -405,7 +402,7 @@ def isUppercase (char : Char) : Bool := Unicode property: `Cased` -/ @[inline] -def isCased (char : Char) : Bool := +public def isCased (char : Char) : Bool := -- ASCII shortcut if char.val < 0x80 then 'A' ≤ char && char ≤ 'Z' || 'a' ≤ char && char ≤ 'z' @@ -419,7 +416,7 @@ def isCased (char : Char) : Bool := Unicode property: `Case_Ignorable` -/ @[inline] -def isCaseIgnorable (char : Char) : Bool := +public def isCaseIgnorable (char : Char) : Bool := char ∈ Unicode.GC.Lm ||| GC.Mn ||| GC.Sk ||| GC.Cf || other.elem char.val where /-- Auxiliary data for `isCaseIgnorable` @@ -451,7 +448,7 @@ where Unicode property: `Simple_Lowercase_Mapping` -/ @[inline] -def getLowerChar (char : Char) : Char := +public def getLowerChar (char : Char) : Char := -- ASCII shortcut if char.val < 0x80 then if 'A' ≤ char && char ≤ 'Z' then @@ -469,7 +466,7 @@ def getLowerChar (char : Char) : Char := Unicode property: `Simple_Uppercase_Mapping` -/ @[inline] -def getUpperChar (char : Char) : Char := +public def getUpperChar (char : Char) : Char := if char.val < 0x80 then if 'a' ≤ char && char ≤ 'z' then Char.ofNat (char.val - 0x20).toNat @@ -486,7 +483,7 @@ def getUpperChar (char : Char) : Char := Unicode property: `Simple_Titlecase_Mapping` -/ @[inline] -def getTitleChar (char : Char) : Char := +public def getTitleChar (char : Char) : Char := if char.val < 0x80 then if 'a' ≤ char && char ≤ 'z' then Char.ofNat (char.val - 0x20).toNat @@ -504,7 +501,7 @@ def getTitleChar (char : Char) : Char := Unicode property: `Canonical_Combining_Class` -/ -def getCanonicalCombiningClass (char : Char) : Nat := +public def getCanonicalCombiningClass (char : Char) : Nat := -- ASCII shortcut if char.val < 0x80 then 0 @@ -516,7 +513,7 @@ def getCanonicalCombiningClass (char : Char) : Nat := Unicode properties: `Decomposition_Mapping` `Decomposition_Type=Canonical` -/ -def getCanonicalDecomposition (char : Char) : String := +public def getCanonicalDecomposition (char : Char) : String := -- ASCII shortcut if char.val < 0x80 then char.toString else String.ofList <| (lookupCanonicalDecompositionMapping char.val).map fun c => Char.ofNat c.toNat @@ -529,7 +526,7 @@ def getCanonicalDecomposition (char : Char) : String := Unicode properties: `Decomposition_Type` `Decomposition_Mapping` -/ -def getDecompositionMapping? (char : Char) : Option DecompositionMapping := +public def getDecompositionMapping? (char : Char) : Option DecompositionMapping := -- ASCII shortcut if char.val < 0x80 then none @@ -544,7 +541,7 @@ def getDecompositionMapping? (char : Char) : Option DecompositionMapping := Unicode property: `Numeric_Type=Numeric` -/ @[inline] -def isNumeric (char : Char) : Bool := +public def isNumeric (char : Char) : Bool := -- ASCII shortcut if char.val < 0x80 then char >= '0' && char <= '9' @@ -569,7 +566,7 @@ where Unicode property: `Numeric_Type=Digit` -/ @[inline] -def isDigit (char : Char) : Bool := +public def isDigit (char : Char) : Bool := -- ASCII shortcut if char.val < 0x80 then char >= '0' && char <= '9' @@ -585,7 +582,7 @@ def isDigit (char : Char) : Bool := `Numeric_Type=Digit` `Numeric_Value` -/ @[inline] -def getDigit? (char : Char) : Option (Fin 10) := +public def getDigit? (char : Char) : Option (Fin 10) := -- ASCII shortcut if char.val < 0x80 then if char.toNat < '0'.toNat then @@ -609,7 +606,7 @@ def getDigit? (char : Char) : Option (Fin 10) := Unicode property: `Numeric_Type=Decimal` -/ @[inline] -def isDecimal (char : Char) : Bool := +public def isDecimal (char : Char) : Bool := -- ASCII shortcut if char.val < 0x80 then char >= '0' && char <= '9' @@ -628,7 +625,7 @@ def isDecimal (char : Char) : Bool := `Numeric_Type=Decimal` `Numeric_Value` -/ @[inline] -def getDecimalRange? (char : Char) : Option (Char × Char) := +public def getDecimalRange? (char : Char) : Option (Char × Char) := -- ASCII shortcut if char.val < 0x80 then if char >= '0' && char <= '9' then @@ -646,7 +643,7 @@ def getDecimalRange? (char : Char) : Option (Char × Char) := Unicode property: `Hex_Digit` -/ @[inline] -def isHexDigit (char : Char) : Bool := +public def isHexDigit (char : Char) : Bool := -- Extracted from `PropList.txt` char.val <= 0x0039 && char.val >= 0x0030 || -- [10] DIGIT ZERO..DIGIT NINE char.val <= 0x0046 && char.val >= 0x0041 || -- [6] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER F @@ -659,7 +656,7 @@ def isHexDigit (char : Char) : Bool := Unicode property: `Hex_Digit` -/ @[inline] -def getHexDigit? (char : Char) : Option (Fin 16) := +public def getHexDigit? (char : Char) : Option (Fin 16) := if char.toNat < 0x30 then none else @@ -689,7 +686,7 @@ def getHexDigit? (char : Char) : Option (Fin 16) := Unicode property: `Noncharacter_Code_Point` -/ @[inline] -def isNoncharacterCodePoint (char : Char) : Bool := +public def isNoncharacterCodePoint (char : Char) : Bool := lookupNoncharacterCodePoint char.val /-- Check if ignorable character @@ -697,7 +694,7 @@ def isNoncharacterCodePoint (char : Char) : Bool := Unicode property: `Default_Ignorable_Code_Point` -/ @[inline] -def isDefaultIgnorableCodePoint (char : Char) : Bool := +public def isDefaultIgnorableCodePoint (char : Char) : Bool := lookupDefaultIgnorableCodePoint char.val /-- Check if white space character @@ -705,7 +702,7 @@ def isDefaultIgnorableCodePoint (char : Char) : Bool := Unicode property: `White_Space` -/ @[inline] -def isWhiteSpace (char : Char) : Bool := +public def isWhiteSpace (char : Char) : Bool := -- ASCII shortcut if char.val < 0x80 then char == ' ' || char >= '\t' && char <= '\r' @@ -718,7 +715,7 @@ def isWhiteSpace (char : Char) : Bool := Unicode property: `Math` -/ @[inline] -def isMath (char : Char) : Bool := lookupMath char.val +public def isMath (char : Char) : Bool := lookupMath char.val /-- Check if alphabetic character @@ -726,7 +723,7 @@ def isMath (char : Char) : Bool := lookupMath char.val Unicode property: `Alphabetic` -/ @[inline] -def isAlphabetic (char : Char) : Bool := +public def isAlphabetic (char : Char) : Bool := -- ASCII shortcut if char.val < 0x80 then 'A' ≤ char && char ≤ 'Z' || 'a' ≤ char && char ≤ 'z' @@ -734,6 +731,6 @@ def isAlphabetic (char : Char) : Bool := lookupAlphabetic char.val @[inherit_doc isAlphabetic] -abbrev isAlpha := isAlphabetic +public abbrev isAlpha := isAlphabetic end Unicode diff --git a/UnicodeBasic/CharacterDatabase.lean b/UnicodeBasic/CharacterDatabase.lean index fa22df1ed..6a1c8cf75 100644 --- a/UnicodeBasic/CharacterDatabase.lean +++ b/UnicodeBasic/CharacterDatabase.lean @@ -4,8 +4,6 @@ Released under Apache 2.0 license as described in the file LICENSE. -/ module -public section - namespace Unicode /-- Stream type for Unicode Character Database (UCD) files @@ -15,7 +13,7 @@ namespace Unicode around field values are not significant. Line comments are prefixed with a number sign `#` (U+0023). -/ -structure UCDStream extends String.Slice where +public structure UCDStream extends String.Slice where /-- `isUnihan` is true if the records are tab separated -/ isUnihan := false deriving Inhabited @@ -23,23 +21,23 @@ deriving Inhabited namespace UCDStream /-- Make a `UCDStream` from a string slice -/ -def ofStringSlice (str : String.Slice) : UCDStream := { str with } +public def ofStringSlice (str : String.Slice) : UCDStream := { str with } /-- Make a `UCDStream` from a string -/ -def ofString (str : String) : UCDStream := ofStringSlice str.toSlice +public def ofString (str : String) : UCDStream := ofStringSlice str.toSlice /-- Make a `UCDStream` from a substring -/ -def ofSubstring (str : Substring.Raw) : UCDStream := ofStringSlice str.toString.toSlice +public def ofSubstring (str : Substring.Raw) : UCDStream := ofStringSlice str.toString.toSlice /-- Make a `UCDStream` from a file -/ -def ofFile (path : System.FilePath) : IO UCDStream := +public def ofFile (path : System.FilePath) : IO UCDStream := ofString <$> IO.FS.readFile path /-- Get the next line from the `UCDStream` Line comments are stripped and blank lines are skipped. -/ -protected partial def nextLine? (stream : UCDStream) : Option (String.Slice × UCDStream) := do +public protected partial def nextLine? (stream : UCDStream) : Option (String.Slice × UCDStream) := do let line := stream.trimAsciiEnd.takeWhile (.!='\n') if h : line.rawEndPos < stream.rawEndPos then let nextPos := stream.posGT line.rawEndPos h @@ -54,7 +52,7 @@ protected partial def nextLine? (stream : UCDStream) : Option (String.Slice × U Spaces around field values are trimmed. -/ -protected def next? (stream : UCDStream) : Option (Array String.Slice × UCDStream) := do +public protected def next? (stream : UCDStream) : Option (Array String.Slice × UCDStream) := do let sep := if stream.isUnihan then "\t" else ";" let mut arr := #[] let (line, table) ← stream.nextLine? @@ -62,5 +60,5 @@ protected def next? (stream : UCDStream) : Option (Array String.Slice × UCDStre arr := arr.push item.trimAscii return (arr, table) -instance : Std.Stream UCDStream (Array String.Slice) where +public instance : Std.Stream UCDStream (Array String.Slice) where next? := UCDStream.next? diff --git a/UnicodeBasic/Hangul.lean b/UnicodeBasic/Hangul.lean index b16b276dc..87de475ef 100644 --- a/UnicodeBasic/Hangul.lean +++ b/UnicodeBasic/Hangul.lean @@ -4,59 +4,57 @@ Released under Apache 2.0 license as described in the file LICENSE. -/ module -public section - namespace Unicode.Hangul -def shortNameL : Array String := +public def shortNameL : Array String := #["G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"] -def shortNameV : Array String := +public def shortNameV : Array String := #["A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"] -def shortNameT : Array String := +public def shortNameT : Array String := #["", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"] -abbrev sizeL := shortNameL.size -- 19 -abbrev sizeV := shortNameV.size -- 21 -abbrev sizeT := shortNameT.size -- 28 -abbrev sizeLV := sizeL * sizeV -- 399 -abbrev sizeVT := sizeV * sizeT -- 588 -abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172 +public abbrev sizeL := shortNameL.size -- 19 +public abbrev sizeV := shortNameV.size -- 21 +public abbrev sizeT := shortNameT.size -- 28 +public abbrev sizeLV := sizeL * sizeV -- 399 +public abbrev sizeVT := sizeV * sizeT -- 588 +public abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172 -abbrev baseL : UInt32 := 0x1100 -abbrev baseV : UInt32 := 0x1161 -abbrev baseT : UInt32 := 0x11A7 +public abbrev baseL : UInt32 := 0x1100 +public abbrev baseV : UInt32 := 0x1161 +public abbrev baseT : UInt32 := 0x11A7 /-- Number of Hangul syllables -/ -def Syllable.size := sizeLVT +public def Syllable.size := sizeLVT /-- Starting code point for Hangul syllables -/ -def Syllable.base : UInt32 := 0xAC00 +public def Syllable.base : UInt32 := 0xAC00 /-- Stopping code point for Hangul syllables -/ -def Syllable.last : UInt32 := ⟨0xAC00 + Syllable.size - 1, by decide⟩ +public def Syllable.last : UInt32 := ⟨0xAC00 + Syllable.size - 1, by decide⟩ /-- Hangul syllable type -/ -structure Syllable where +public structure Syllable where /-- L part of syllable -/ - toL : Fin sizeL + public toL : Fin sizeL /-- V part of syllable -/ - toV : Fin sizeV + public toV : Fin sizeV /-- T part of syllable (0 means none) -/ - toT : Fin sizeT := ⟨0, by decide⟩ + public toT : Fin sizeT := ⟨0, by decide⟩ deriving DecidableEq, Repr -instance : Inhabited Syllable where +public instance : Inhabited Syllable where default := ⟨⟨0, by decide⟩, ⟨0, by decide⟩, ⟨0, by decide⟩⟩ /-- LV part of syllable -/ -def Syllable.toLV : Syllable → Fin sizeLV +public def Syllable.toLV : Syllable → Fin sizeLV | ⟨⟨l, hl⟩, ⟨v, hv⟩, _⟩ => have : l * sizeV + v < sizeLV := calc _ < l * sizeV + sizeV := by apply Nat.add_lt_add_left; exact hv @@ -64,7 +62,7 @@ def Syllable.toLV : Syllable → Fin sizeLV _ ≤ sizeL * sizeV := by apply Nat.mul_le_mul_right; exact hl ⟨l * sizeV + v, this⟩ -def Syllable.index (s : Syllable) : Fin Syllable.size := +public def Syllable.index (s : Syllable) : Fin Syllable.size := match s.toLV, s.toT with | ⟨lv, hlv⟩, ⟨t, ht⟩ => have : lv * sizeT + t < sizeL * sizeV * sizeT := calc @@ -75,27 +73,27 @@ def Syllable.index (s : Syllable) : Fin Syllable.size := /-- Get short name for Hangul syllable -/ @[inline] -def Syllable.getShortName (s : Syllable) : String := shortNameL[s.toL] ++ shortNameV[s.toV] ++ shortNameT[s.toT] +public def Syllable.getShortName (s : Syllable) : String := shortNameL[s.toL] ++ shortNameV[s.toV] ++ shortNameT[s.toT] /-- Get L part character for syllable -/ -def Syllable.getLChar (s : Syllable) : Char := +public def Syllable.getLChar (s : Syllable) : Char := .ofNat (baseL.toNat + s.toL) /-- Get V part character for syllable -/ -def Syllable.getVChar (s : Syllable) : Char := +public def Syllable.getVChar (s : Syllable) : Char := .ofNat (baseV.toNat + s.toV) /-- Get LV part character for syllable -/ -def Syllable.getLVChar (s : Syllable) : Char := +public def Syllable.getLVChar (s : Syllable) : Char := .ofNat (base.toNat + sizeT * s.toLV) /-- Get T part character for syllable -/ -def Syllable.getTChar? (s : Syllable) : Option Char := +public def Syllable.getTChar? (s : Syllable) : Option Char := if s.toT.val == 0 then none else return .ofNat (baseT.toNat + s.toT) /-- Get Hangul syllable from code point -/ -def getSyllable? (code : UInt32) : Option Syllable := +public def getSyllable? (code : UInt32) : Option Syllable := if code < 0xAC00 then none else let index := (code - 0xAC00).toNat if h : index ≥ Syllable.size then none else @@ -111,7 +109,7 @@ def getSyllable? (code : UInt32) : Option Syllable := some ⟨⟨lpart, lislt⟩, ⟨vpart, vislt⟩, ⟨tpart, tislt⟩⟩ @[inherit_doc getSyllable?] -def getSyllable! (code : UInt32) : Syllable := +public def getSyllable! (code : UInt32) : Syllable := match getSyllable? code with | some s => s | none => panic! "not a Hangul syllable" diff --git a/UnicodeBasic/TableLookup.lean b/UnicodeBasic/TableLookup.lean index f93606630..1dce3df30 100644 --- a/UnicodeBasic/TableLookup.lean +++ b/UnicodeBasic/TableLookup.lean @@ -7,8 +7,6 @@ import UnicodeBasic.CharacterDatabase import UnicodeBasic.Hangul public import UnicodeBasic.Types -public section - namespace Unicode namespace CLib @@ -74,7 +72,7 @@ private def parsePropTable (s : String) : Thunk <| Array (UInt32 × UInt32) := I /-- Get bidirectional class using lookup table Unicode property: `Bidi_Class` -/ -def lookupBidiClass (c : UInt32) : BidiClass := +public def lookupBidiClass (c : UInt32) : BidiClass := let table := table.get if c < table[0]!.1 then .BN else match table[find c (fun i => table[i]!.1) 0 table.size.toUSize]! with @@ -87,7 +85,7 @@ where /-- Get canonical combining class using lookup table Unicode property: `Canonical_Combining_Class` -/ -def lookupCanonicalCombiningClass (c : UInt32) : Nat := +public def lookupCanonicalCombiningClass (c : UInt32) : Nat := let t := table.get if c < t[0]!.1 then 0 else match t[find c (fun i => t[i]!.1) 0 t.size.toUSize]! with @@ -102,7 +100,7 @@ where Unicode properties: `Decomposition_Mapping` `Decomposition_Type=Canonical` -/ -def lookupCanonicalDecompositionMapping (c : UInt32) : List UInt32 := +public def lookupCanonicalDecompositionMapping (c : UInt32) : List UInt32 := -- Hangul syllables if Hangul.Syllable.base ≤ c && c ≤ Hangul.Syllable.last then let s := Hangul.getSyllable! c @@ -125,7 +123,7 @@ where `Simple_Lowercase_Mapping` `Simple_Uppercase_Mapping` `Simple_Titlecase_Mapping` -/ -def lookupCaseMapping (c : UInt32) : UInt32 × UInt32 × UInt32 := +public def lookupCaseMapping (c : UInt32) : UInt32 × UInt32 × UInt32 := let v : UInt64 := CLib.lookupCase c if v == 0 then (c, c, c) else let cu : UInt32 := v.toUInt32 &&& 0x001FFFFF @@ -138,7 +136,7 @@ def lookupCaseMapping (c : UInt32) : UInt32 × UInt32 × UInt32 := Unicode properties: `Decomposition_Mapping` `Decomposition_Type` -/ -def lookupDecompositionMapping? (c : UInt32) : Option DecompositionMapping := +public def lookupDecompositionMapping? (c : UInt32) : Option DecompositionMapping := -- Hangul syllables if Hangul.Syllable.base ≤ c && c ≤ Hangul.Syllable.last then let s := Hangul.getSyllable! c @@ -183,12 +181,12 @@ where Unicode property: `General_Category` -/ @[inline] -def lookupGC (c : UInt32) : GC := CLib.lookupProp c |>.toUInt32 +public def lookupGC (c : UInt32) : GC := CLib.lookupProp c |>.toUInt32 /-- Get name of a code point using lookup table Unicode property: `Name` -/ -def lookupName (c : UInt32) : String := +public def lookupName (c : UInt32) : String := let table := table.get if c < table[0]!.1 then unreachable! else match table[find c (fun i => table[i]!.1) 0 table.size.toUSize]! with @@ -196,23 +194,23 @@ def lookupName (c : UInt32) : String := if c ≤ v then if Char.ofUInt8 (d.getUTF8Byte! 0) == '<' then if d == "" then - s!"" + s!"" else if d == "" then - s!"" + s!"" else if d == "" then - s!"" + s!"" else if d == "" then - s!"" + s!"" else if d == "" then - "CJK UNIFIED IDEOGRAPH-" ++ toHexStringAux c + "CJK UNIFIED IDEOGRAPH-" ++ toHexStringRaw c else if d == "" then - "CJK COMPATIBILITY IDEOGRAPH-" ++ toHexStringAux c + "CJK COMPATIBILITY IDEOGRAPH-" ++ toHexStringRaw c else if d == "" then "HANGUL SYLLABLE " ++ (Hangul.getSyllable! c).getShortName else if d == "" then - "KHITAN SMALL SCRIPT CHARACTER-" ++ toHexStringAux c + "KHITAN SMALL SCRIPT CHARACTER-" ++ toHexStringRaw c else if d == "" then - "NUSHU CHARACTER-" ++ toHexStringAux c + "NUSHU CHARACTER-" ++ toHexStringRaw c else if d == "" then let i := if c.toNat < 0x18B00 then -- Tangut Component @@ -226,10 +224,10 @@ def lookupName (c : UInt32) : String := else i "TANGUT COMPONENT-" ++ i else if d == "" then - "TANGUT IDEOGRAPH-" ++ toHexStringAux c + "TANGUT IDEOGRAPH-" ++ toHexStringRaw c else panic! s!"unknown name range {d.copy}" else String.Slice.copy d - else s!"" + else s!"" where str : String := include_str "../data/table/Name.txt" table : Thunk <| Array (UInt32 × UInt32 × String.Slice) := @@ -240,7 +238,7 @@ where Unicode properties: `Numeric_Type` `Numeric_Value` -/ -def lookupNumericValue (c : UInt32) : Option NumericType := +public def lookupNumericValue (c : UInt32) : Option NumericType := let table := table.get if c < table[0]!.1 then none else match table[find c (fun i => table[i]!.1) 0 table.size.toUSize]! with @@ -288,7 +286,7 @@ where /-- Get other properties using lookup table Unicode properties: `Other_Alphabetic`, `Other_Lowercase`, `Other_Uppercase`, `Other_Math` -/ -def lookupOther (c : UInt32) : UInt32 := +public def lookupOther (c : UInt32) : UInt32 := CLib.lookupProp c >>> 32 |>.toUInt32 /-! Properties -/ @@ -297,7 +295,7 @@ def lookupOther (c : UInt32) : UInt32 := Unicode property: `Alphabetic` -/ @[inline] -def lookupAlphabetic (c : UInt32) : Bool := +public def lookupAlphabetic (c : UInt32) : Bool := let m := CLib.oAlpha ||| (GC.L ||| GC.Nl).toUInt64 CLib.lookupProp c &&& m != 0 @@ -305,7 +303,7 @@ def lookupAlphabetic (c : UInt32) : Bool := Unicode property: `Bidi_Mirrored` -/ -def lookupBidiMirrored (c : UInt32) : Bool := +public def lookupBidiMirrored (c : UInt32) : Bool := let table := table.get if c < table[0]!.1 then false else match table[find c (fun i => table[i]!.1) 0 table.size.toUSize]! with @@ -318,7 +316,7 @@ where Unicode property: `Cased` -/ @[inline] -def lookupCased (c : UInt32 ) : Bool := +public def lookupCased (c : UInt32 ) : Bool := let m := CLib.oUpper ||| CLib.oLower ||| GC.LC.toUInt64 CLib.lookupProp c &&& m != 0 @@ -326,7 +324,7 @@ def lookupCased (c : UInt32 ) : Bool := Unicode property: `Default_Ignorable_Code_Point` -/ @[inline] -def lookupDefaultIgnorableCodePoint (c : UInt32 ) : Bool := +public def lookupDefaultIgnorableCodePoint (c : UInt32 ) : Bool := let table := table.get if c < table[0]!.1 then false else match table[find c (fun i => table[i]!.1) 0 table.size.toUSize]! with @@ -339,7 +337,7 @@ where Unicode property: `Lowercase` -/ @[inline] -def lookupLowercase (c : UInt32) : Bool := +public def lookupLowercase (c : UInt32) : Bool := let m := CLib.oLower ||| GC.Ll.toUInt64 CLib.lookupProp c &&& m != 0 @@ -348,7 +346,7 @@ def lookupLowercase (c : UInt32) : Bool := Unicode property: `Math` -/ @[inline] -def lookupMath (c : UInt32) : Bool := +public def lookupMath (c : UInt32) : Bool := let m := CLib.oMath ||| GC.Sm.toUInt64 CLib.lookupProp c &&& m != 0 @@ -356,28 +354,28 @@ def lookupMath (c : UInt32) : Bool := Unicode property: `Noncharacter_Code_Point` -/ @[inline] -def lookupNoncharacterCodePoint (c : UInt32) : Bool := +public def lookupNoncharacterCodePoint (c : UInt32) : Bool := (c ≤ 0xFDEF && 0xFDD0 ≤ c) || (c ≤ Unicode.max && c &&& 0xFFFE == 0xFFFE) /-- Check if code point is a titlecase letter using lookup table Unicode property: `Titlecase` -/ @[inline] -def lookupTitlecase (c : UInt32) : Bool := +public def lookupTitlecase (c : UInt32) : Bool := lookupGC c == GC.Lt /-- Check if code point is a uppercase letter using lookup table Unicode property: `Uppercase` -/ @[inline] -def lookupUppercase (c : UInt32) : Bool := +public def lookupUppercase (c : UInt32) : Bool := let m := CLib.oUpper ||| GC.Lu.toUInt64 CLib.lookupProp c &&& m != 0 /-- Check if code point is a white space character using lookup table Unicode property: `White_Space` -/ -def lookupWhiteSpace (c : UInt32) : Bool := +public def lookupWhiteSpace (c : UInt32) : Bool := let table := table.get if c < table[0]!.1 then false else match table[find c (fun i => table[i]!.1) 0 table.size.toUSize]! with @@ -390,12 +388,12 @@ where Unicode property: `Script` -/ @[inline] -def lookupScript (c : UInt32) : Script := CLib.lookupScript c +public def lookupScript (c : UInt32) : Script := CLib.lookupScript c /-- Get the name of a script Unicode property: `Script` -/ -def lookupScriptName (s : Script) : Option String.Slice := +public def lookupScriptName (s : Script) : Option String.Slice := let table := table.get if s.code < table[0]!.1 then none else match table[find s.code (fun i => table[i]!.1) 0 table.size.toUSize]! with diff --git a/UnicodeBasic/Types.lean b/UnicodeBasic/Types.lean index 8c179d658..fce044543 100644 --- a/UnicodeBasic/Types.lean +++ b/UnicodeBasic/Types.lean @@ -3,44 +3,49 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ module -public import Std.Data.HashMap - -public section +import Std.Data.HashMap /-- Low-level conversion from `UInt32` to `Char` (*unsafe*) This function translates to a no-op in the compiler. However, it does not check whether the `UInt32` code is a valid `Char` value. Only use where it's known for external reasons that the code is valid. -/ -protected unsafe def Char.mkUnsafe : UInt32 → Char := unsafeCast +public protected unsafe def Char.mkUnsafe : UInt32 → Char := unsafeCast namespace Unicode /-- Maximum valid code point value -/ -@[simp, grind =] protected abbrev max : UInt32 := 0x10FFFF +@[simp, grind =] +public protected abbrev max : UInt32 := 0x10FFFF /-- Minimum high surrogate code point -/ -@[simp, grind =] protected abbrev minHighSurrogate : UInt32 := 0xD800 +@[simp, grind =] +public protected abbrev minHighSurrogate : UInt32 := 0xD800 /-- Maximum high surrogate code point -/ -@[simp, grind =] protected abbrev maxHighSurrogate : UInt32 := 0xDBFF +@[simp, grind =] +public protected abbrev maxHighSurrogate : UInt32 := 0xDBFF /-- Minimum low surrogate code point -/ -@[simp, grind =] protected abbrev minLowSurrogate : UInt32 := 0xDC00 +@[simp, grind =] +public protected abbrev minLowSurrogate : UInt32 := 0xDC00 /-- Maximum low surrogate code point -/ -@[simp, grind =] protected abbrev maxLowSurrogate : UInt32 := 0xDFFF +@[simp, grind =] +public protected abbrev maxLowSurrogate : UInt32 := 0xDFFF /-- Minimum surrogate code point -/ -@[simp, grind =] protected abbrev minSurrogate : UInt32 := Unicode.minHighSurrogate +@[simp, grind =] +public protected abbrev minSurrogate : UInt32 := Unicode.minHighSurrogate /-- Minimum surrogate code point -/ -@[simp, grind =] protected abbrev maxSurrogate : UInt32 := Unicode.maxLowSurrogate +@[simp, grind =] +public protected abbrev maxSurrogate : UInt32 := Unicode.maxLowSurrogate -/-- Hexadecimal string representation of a code point +/-- Raw hexadecimal string representation of a code point Same as `toHexString` but without the `U+` prefix. -/ -def toHexStringAux (code : UInt32) : String := Id.run do +public def toHexStringRaw (code : UInt32) : String := Id.run do let hex := #['0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'] let mut code := code let mut dgts := [] @@ -58,15 +63,15 @@ def toHexStringAux (code : UInt32) : String := Id.run do (e.g. `U+0123` and `U+4B0A1` but neither `U+123` nor `U+4b0a1`). -/ @[inline] -def toHexString (code : UInt32) : String := - "U+" ++ toHexStringAux code +public def toHexString (code : UInt32) : String := + "U+" ++ toHexStringRaw code /-- Get code point from hexadecimal string representation For convenience, the `U+` prefix may be omitted and lowercase hexadecimal digits are accepted. -/ -def ofHexString? (str : String.Slice) : Option UInt32 := do +public def ofHexString? (str : String.Slice) : Option UInt32 := do let str := if str.take 2 == "U+" then str.drop 2 else str if str.isEmpty || str.utf8ByteSize > 8 then none else let mut val : UInt32 := 0 @@ -76,7 +81,6 @@ def ofHexString? (str : String.Slice) : Option UInt32 := do where - /-- Get value of hex digit -/ @[inline] hexValue? (dgt : Char) : Option UInt32 := do if dgt.val < '0'.val then none else let mut n := dgt.val - '0'.val @@ -94,7 +98,7 @@ where none @[inherit_doc ofHexString?] -def ofHexString! (str : String.Slice) : UInt32 := +public def ofHexString! (str : String.Slice) : UInt32 := match ofHexString? str with | some val => val | none => panic! "invalid unicode hexadecimal string representation" @@ -103,221 +107,77 @@ def ofHexString! (str : String.Slice) : UInt32 := ## General Category ## -/ -/-- Major general category (`L`, `M`, `N`, `P`, `S`, `Z`, `C`) - - Unicode property: `General_Category` -/ -inductive MajorGeneralCategory : Type -/-- (`L`) Letter -/ -| letter -/-- (`M`) Mark -/ -| mark -/-- (`N`) Number -/ -| number -/-- (`P`) Punctuation -/ -| punctuation -/-- (`S`) Symbol -/ -| symbol -/-- (`Z`) Separator -/ -| separator -/-- (`C`) Other -/ -| other -deriving Inhabited, DecidableEq - -/-- String abbreviation for major general category -/ -def MajorGeneralCategory.toAbbrev : MajorGeneralCategory → String -| letter => "L" -| mark => "M" -| number => "N" -| punctuation => "P" -| symbol => "S" -| separator => "Z" -| other => "C" - -/-- Minor general category - - Unicode property: `General_Category` -/ -inductive MinorGeneralCategory : MajorGeneralCategory → Type -/-- (`LC`) cased letter (derived from `Lu | Ll | Lt`) -/ -| casedLetter : MinorGeneralCategory .letter -/-- (`Lu`) uppercase letter -/ -| uppercaseLetter : MinorGeneralCategory .letter -/-- (`Ll`) lowercase letter -/ -| lowercaseLetter : MinorGeneralCategory .letter -/-- (`Lt`) titlecase letter: digraphic character, with first part uppercase -/ -| titlecaseLetter : MinorGeneralCategory .letter -/-- (`Lm`) modifier letter -/ -| modifierLetter : MinorGeneralCategory .letter -/-- (`Lo`) other letters, including syllables and ideographs -/ -| otherLetter : MinorGeneralCategory .letter -/-- (`Mn`) nonspacing combining mark (zero advance width) -/ -| nonspacingMark : MinorGeneralCategory .mark -/-- (`Mc`) spacing combining mark (positive advance width) -/ -| spacingMark : MinorGeneralCategory .mark -/-- (`Me`) enclosing combining mark -/ -| enclosingMark : MinorGeneralCategory .mark -/-- (`Nd`) decimal digit -/ -| decimalNumber : MinorGeneralCategory .number -/-- (`Nl`) letter number: a letterlike numeric character -/ -| letterNumber : MinorGeneralCategory .number -/-- (`No`) numeric character of other type -/ -| otherNumber : MinorGeneralCategory .number -/-- (`Pc`) connecting punctuation mark, like a tie -/ -| connectorPunctuation : MinorGeneralCategory .punctuation -/-- (`Pd`) dash or hyphen punctuation mark -/ -| dashPunctuation : MinorGeneralCategory .punctuation -/-- (`PG`) grouping punctuation mark (derived from `Ps | Pe`) -/ -| groupPunctuation : MinorGeneralCategory .punctuation -/-- (`Ps`) opening punctuation mark (of a pair) -/ -| openPunctuation : MinorGeneralCategory .punctuation -/-- (`Pe`) closing punctuation mark (of a pair) -/ -| closePunctuation : MinorGeneralCategory .punctuation -/-- (`PQ`) quoting punctuation mark (derived from `Pi | Pf`) -/ -| quotePunctuation : MinorGeneralCategory .punctuation -/-- (`Pi`) initial quotation mark -/ -| initialPunctuation : MinorGeneralCategory .punctuation -/-- (`Pf`) final quotation mark -/ -| finalPunctuation : MinorGeneralCategory .punctuation -/-- (`Po`) punctuation mark of other type -/ -| otherPunctuation : MinorGeneralCategory .punctuation -/-- (`Sm`) symbol of mathematical use -/ -| mathSymbol : MinorGeneralCategory .symbol -/-- (`Sc`) currency sign -/ -| currencySymbol : MinorGeneralCategory .symbol -/-- (`Sk`) non-letterlike modifier symbol -/ -| modifierSymbol : MinorGeneralCategory .symbol -/-- (`So`) symbol of other type -/ -| otherSymbol : MinorGeneralCategory .symbol -/-- (`Zs`) space character (of various non-zero widths) -/ -| spaceSeparator : MinorGeneralCategory .separator -/-- (`Zl`) line separator (U+2028 LINE SEPARATOR only) -/ -| lineSeparator : MinorGeneralCategory .separator -/-- (`Zp`) paragraph separator (U+2029 PARAGRAPH SEPARATOR only) -/ -| paragraphSeparator : MinorGeneralCategory .separator -/-- (`Cc`) C0 or C1 control code -/ -| control : MinorGeneralCategory .other -/-- (`Cf`) format control character -/ -| format : MinorGeneralCategory .other -/-- (`Cs`) surrogate code point -/ -| surrogate : MinorGeneralCategory .other -/-- (`Co`) private-use character -/ -| privateUse : MinorGeneralCategory .other -/-- (`Cn`) reserved unassigned code point or a noncharacter -/ -| unassigned : MinorGeneralCategory .other -deriving DecidableEq - /-- General category (GC) Unicode property: `General_Category` -/ @[expose] -def GC := UInt32 deriving DecidableEq, Inhabited +public def GC := UInt32 deriving DecidableEq, Inhabited namespace GC -instance : OrOp GC := inferInstanceAs (OrOp UInt32) +public instance : OrOp GC := inferInstanceAs (OrOp UInt32) -instance : AndOp GC := inferInstanceAs (AndOp UInt32) +public instance : AndOp GC := inferInstanceAs (AndOp UInt32) -instance : Complement GC where +public instance : Complement GC where complement x := UInt32.xor x 0x3FFFFFFF -instance : HasSubset GC where +public instance : HasSubset GC where Subset x y := x &&& y == x -instance (x y : GC) : Decidable (x ⊆ y) := inferInstanceAs (Decidable (_ == _)) - -protected def none : GC := (0x00000000 : UInt32) -protected def univ : GC := (0x3FFFFFFF : UInt32) - -protected def Lu : GC := (0x00000001 : UInt32) -protected def Ll : GC := (0x00000002 : UInt32) -protected def Lt : GC := (0x00000004 : UInt32) -protected def Lm : GC := (0x00000008 : UInt32) -protected def Lo : GC := (0x00000010 : UInt32) -protected def LC : GC := .Lu ||| .Ll ||| .Lt -protected def L : GC := .Lu ||| .Ll ||| .Lt ||| .Lm ||| .Lo - -protected def Mn : GC := (0x00000020 : UInt32) -protected def Mc : GC := (0x00000040 : UInt32) -protected def Me : GC := (0x00000080 : UInt32) -protected def M : GC := .Mn ||| .Mc ||| .Me - -protected def Nd : GC := (0x00000100 : UInt32) -protected def Nl : GC := (0x00000200 : UInt32) -protected def No : GC := (0x00000400 : UInt32) -protected def N : GC := .Nd ||| .Nl ||| .No - -protected def Pc : GC := (0x00000800 : UInt32) -protected def Pd : GC := (0x00001000 : UInt32) -protected def Ps : GC := (0x00002000 : UInt32) -protected def Pe : GC := (0x00004000 : UInt32) -protected def Pi : GC := (0x00008000 : UInt32) -protected def Pf : GC := (0x00010000 : UInt32) -protected def Po : GC := (0x00020000 : UInt32) -protected def PG : GC := .Ps ||| .Pe -protected def PQ : GC := .Pi ||| .Pf -protected def P : GC := .Pc ||| .Pd ||| .Ps ||| .Pe ||| .Pi ||| .Pf ||| .Po - -protected def Sm : GC := (0x00040000 : UInt32) -protected def Sc : GC := (0x00080000 : UInt32) -protected def Sk : GC := (0x00100000 : UInt32) -protected def So : GC := (0x00200000 : UInt32) -protected def S : GC := .Sm ||| .Sc ||| .Sk ||| .So - -protected def Zs : GC := (0x00400000 : UInt32) -protected def Zl : GC := (0x00800000 : UInt32) -protected def Zp : GC := (0x01000000 : UInt32) -protected def Z : GC := .Zs ||| .Zl ||| .Zp - -protected def Cc : GC := (0x02000000 : UInt32) -protected def Cf : GC := (0x04000000 : UInt32) -protected def Cs : GC := (0x08000000 : UInt32) -protected def Co : GC := (0x10000000 : UInt32) -protected def Cn : GC := (0x20000000 : UInt32) -protected def C : GC := .Cc ||| .Cf ||| .Cs ||| .Co ||| .Cn - -def mk : (major : MajorGeneralCategory) → Option (MinorGeneralCategory major) → GC -| .letter, none => .L -| _, some .casedLetter => .LC -| _, some .uppercaseLetter => .Lu -| _, some .lowercaseLetter => .Ll -| _, some .titlecaseLetter => .Lt -| _, some .modifierLetter => .Lm -| _, some .otherLetter => .Lo -| .mark, none => .M -| _, some .nonspacingMark => .Mn -| _, some .spacingMark => .Mc -| _, some .enclosingMark => .Me -| .number, none => .N -| _, some .decimalNumber => .Nd -| _, some .letterNumber => .Nl -| _, some .otherNumber => .No -| .punctuation, none => .P -| _, some .connectorPunctuation => .Pc -| _, some .dashPunctuation => .Pd -| _, some .groupPunctuation => .PG -| _, some .openPunctuation => .Ps -| _, some .closePunctuation => .Pe -| _, some .quotePunctuation => .PQ -| _, some .initialPunctuation => .Pi -| _, some .finalPunctuation => .Pf -| _, some .otherPunctuation => .Po -| .symbol, none => .S -| _, some .mathSymbol => .Sm -| _, some .currencySymbol => .Sc -| _, some .modifierSymbol => .Sk -| _, some .otherSymbol => .So -| .separator, none => .Z -| _, some .spaceSeparator => .Zs -| _, some .lineSeparator => .Zl -| _, some .paragraphSeparator => .Zp -| .other, none => .C -| _, some .control => .Cc -| _, some .format => .Cf -| _, some .surrogate => .Cs -| _, some .privateUse => .Co -| _, some .unassigned => .Cn - -private def reprAux (x : GC) (extra := false) : List String := Id.run do +public instance (x y : GC) : Decidable (x ⊆ y) := inferInstanceAs (Decidable (_ == _)) + +public protected def none : GC := (0x00000000 : UInt32) +public protected def univ : GC := (0x3FFFFFFF : UInt32) + +public protected def Lu : GC := (0x00000001 : UInt32) +public protected def Ll : GC := (0x00000002 : UInt32) +public protected def Lt : GC := (0x00000004 : UInt32) +public protected def Lm : GC := (0x00000008 : UInt32) +public protected def Lo : GC := (0x00000010 : UInt32) +public protected def LC : GC := .Lu ||| .Ll ||| .Lt +public protected def L : GC := .Lu ||| .Ll ||| .Lt ||| .Lm ||| .Lo + +public protected def Mn : GC := (0x00000020 : UInt32) +public protected def Mc : GC := (0x00000040 : UInt32) +public protected def Me : GC := (0x00000080 : UInt32) +public protected def M : GC := .Mn ||| .Mc ||| .Me + +public protected def Nd : GC := (0x00000100 : UInt32) +public protected def Nl : GC := (0x00000200 : UInt32) +public protected def No : GC := (0x00000400 : UInt32) +public protected def N : GC := .Nd ||| .Nl ||| .No + +public protected def Pc : GC := (0x00000800 : UInt32) +public protected def Pd : GC := (0x00001000 : UInt32) +public protected def Ps : GC := (0x00002000 : UInt32) +public protected def Pe : GC := (0x00004000 : UInt32) +public protected def Pi : GC := (0x00008000 : UInt32) +public protected def Pf : GC := (0x00010000 : UInt32) +public protected def Po : GC := (0x00020000 : UInt32) +public protected def PG : GC := .Ps ||| .Pe +public protected def PQ : GC := .Pi ||| .Pf +public protected def P : GC := .Pc ||| .Pd ||| .Ps ||| .Pe ||| .Pi ||| .Pf ||| .Po + +public protected def Sm : GC := (0x00040000 : UInt32) +public protected def Sc : GC := (0x00080000 : UInt32) +public protected def Sk : GC := (0x00100000 : UInt32) +public protected def So : GC := (0x00200000 : UInt32) +public protected def S : GC := .Sm ||| .Sc ||| .Sk ||| .So + +public protected def Zs : GC := (0x00400000 : UInt32) +public protected def Zl : GC := (0x00800000 : UInt32) +public protected def Zp : GC := (0x01000000 : UInt32) +public protected def Z : GC := .Zs ||| .Zl ||| .Zp + +public protected def Cc : GC := (0x02000000 : UInt32) +public protected def Cf : GC := (0x04000000 : UInt32) +public protected def Cs : GC := (0x08000000 : UInt32) +public protected def Co : GC := (0x10000000 : UInt32) +public protected def Cn : GC := (0x20000000 : UInt32) +public protected def C : GC := .Cc ||| .Cf ||| .Cs ||| .Co ||| .Cn + +def reprAux (x : GC) (extra := false) : List String := Id.run do let mut c := #[] if .L ⊆ x then c := c.push "L" @@ -412,19 +272,19 @@ private def reprAux (x : GC) (extra := false) : List String := Id.run do return c.toList @[inline] -def toAbbrev! (x : GC) : String := +public def toAbbrev! (x : GC) : String := match reprAux x true with | [a] => a | _ => panic! "invalid general category" open Std.Format Repr in - def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill) - instance : Repr GC where reprPrec +public def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill) +instance : Repr GC where reprPrec -def toString (x : GC) := " | ".intercalate (reprAux x) +public def toString (x : GC) := " | ".intercalate (reprAux x) instance : ToString GC where toString -def ofAbbrev? (s : String.Slice) : Option GC := +public def ofAbbrev? (s : String.Slice) : Option GC := match s.chars.take 3 |>.toList with | ['C'] => some .C | ['C', 'c'] => some .Cc @@ -468,18 +328,18 @@ def ofAbbrev? (s : String.Slice) : Option GC := | ['Z', 'p'] => some .Zp | _ => none -def ofAbbrev! (s : String.Slice) : GC := +public def ofAbbrev! (s : String.Slice) : GC := match ofAbbrev? s with | .some c => c | none => panic! "invalid general category" -def ofString? (s : String.Slice) : Option GC := do +public def ofString? (s : String.Slice) : Option GC := do let mut c := .none for a in s.split "|" do c := c ||| (← GC.ofAbbrev? a.trimAscii) return c -def ofString! (s : String.Slice) : GC := +public def ofString! (s : String.Slice) : GC := match ofString? s with | .some c => c | none => panic! "invalid general category" @@ -493,13 +353,13 @@ end GC /-- Unicode numeric type Unicode properties: `Numeric_Type`, `Numeric_Value` -/ -inductive NumericType +public inductive NumericType /-- Decimal digit type and value -/ -| decimal (value : Fin 10) : NumericType +| public decimal (value : Fin 10) : NumericType /-- Digit type and value -/ -| digit (value : Fin 10) : NumericType +| public digit (value : Fin 10) : NumericType /-- Numeric type and value -/ -| numeric (num : Int) (den : Option Nat) : NumericType +| public numeric (num : Int) (den : Option Nat) : NumericType deriving Inhabited, DecidableEq, Repr /-- Decimal digit type @@ -509,7 +369,7 @@ deriving Inhabited, DecidableEq, Repr Unicode property: `Numeric_Type` -/ -def NumericType.isDecimal : NumericType → Bool +public def NumericType.isDecimal : NumericType → Bool | decimal _ => true | _ => false @@ -519,7 +379,7 @@ def NumericType.isDecimal : NumericType → Bool Unicode property: `Numeric_Type` -/ -def NumericType.isDigit : NumericType → Bool +public def NumericType.isDigit : NumericType → Bool | decimal _ => true | digit _ => true | _ => false @@ -531,7 +391,7 @@ def NumericType.isDigit : NumericType → Bool Unicode property: `Numeric_Value` -/ -def NumericType.value : NumericType → Int ⊕ Int × Nat +public def NumericType.value : NumericType → Int ⊕ Int × Nat | decimal n => .inl n | digit n => .inl n | numeric n none => .inl n @@ -544,39 +404,39 @@ def NumericType.value : NumericType → Int ⊕ Int × Nat /-- Compatibility format tag Unicode properties: `Decomposition_Type`, `Decomposition_Mapping` -/ -inductive CompatibilityTag +public inductive CompatibilityTag /-- Font variant -/ -| font +| public font /-- No-break version of a space or hyphen -/ -| noBreak +| public noBreak /-- Initial presentation form (Arabic) -/ -| initial +| public initial /-- Medial presentation form (Arabic) -/ -| medial +| public medial /-- Final presentation form (Arabic) -/ -| final +| public final /-- Isolated presentation form (Arabic) -/ -| isolated +| public isolated /-- Encircled form -/ -| circle +| public circle /-- Superscript form -/ -| super +| public super /-- Subscript form -/ -| sub +| public sub /-- Vertical layout presentation form -/ -| vertical +| public vertical /-- Wide (or zenkaku) compatibility character -/ -| wide +| public wide /-- Narrow (or hankaku) compatibility character -/ -| narrow +| public narrow /-- Small variant form (CNS compatibility) -/ -| small +| public small /-- CJK squared font variant -/ -| square +| public square /-- Vulgar fraction form -/ -| fraction +| public fraction /-- Otherwise unspecified compatibility character -/ -| compat +| public compat deriving Inhabited, DecidableEq, Repr instance : ToString CompatibilityTag where @@ -601,11 +461,11 @@ instance : ToString CompatibilityTag where /-- Decomposition maping Unicode properties: `Decomposition_Type`, `Decomposition_Mapping` -/ -structure DecompositionMapping where +public structure DecompositionMapping where /-- Compatibility format tag -/ - tag : Option CompatibilityTag + public tag : Option CompatibilityTag /-- Decomposition mapping -/ - mapping : List Char + public mapping : List Char deriving Inhabited, DecidableEq, Repr /-! @@ -615,104 +475,104 @@ deriving Inhabited, DecidableEq, Repr /-- Bidirectional class Unicode property: `Bidi_Class` -/ -inductive BidiClass +public inductive BidiClass /-- (`L`) strong left-to-right character -/ -| leftToRight +| public leftToRight /-- (`R`) strong right-to-left (non-Arabic-type) character -/ -| rightToLeft +| public rightToLeft /-- (`AL`) strong right-to-left (Arabic-type) character -/ -| arabicLetter +| public arabicLetter /-- (`EN`) ASCII digit or Eastern Arabic-Indic digit -/ -| europeanNumber +| public europeanNumber /-- (`ES`) European separator: plus and-/ -| europeanSeparator +| public europeanSeparator /-- (`ET`) European terminator in a numeric format context, includes currency signs -/ -| europeanTerminator +| public europeanTerminator /-- (`AN`) Arabic-Indic digit -/ -| arabicNumber +| public arabicNumber /-- (`CS`) common separator: commas, colons, and slashes -/ -| commonSeparator +| public commonSeparator /-- (`NSM`) nonspacing mark -/ -| nonspacingMark +| public nonspacingMark /-- (`BN`) boundary neutral: most format characters, control codes, or noncharacters -/ -| boundaryNeutral +| public boundaryNeutral /-- (`B`) paragraph separator, various newline characters -/ -| paragraphSeparator +| public paragraphSeparator /-- (`S`) segment separator, various segment-related control codes -/ -| segmentSeparator +| public segmentSeparator /-- (`WS`) white spaces -/ -| whiteSpace +| public whiteSpace /-- (`ON`) other neutral: most other symbols and punctuation marks -/ -| otherNeutral +| public otherNeutral /-- (`LRE`) left to right embedding (U+202A: the LR embedding control) -/ -| leftToRightEmbedding +| public leftToRightEmbedding /-- (`LRO`) Left_To_Right_Override (U+202D: the LR override control) -/ -| leftToRightOverride +| public leftToRightOverride /-- (`RLE`) right-to-left embedding (U+202B: the RL embedding control) -/ -| rightToLeftEmbeding +| public rightToLeftEmbeding /-- (`RLO`) right-to-left override (U+202E: the RL override control) -/ -| rightToLeftOverride +| public rightToLeftOverride /-- (`PDF`) pop directional format (U+202C: terminates an embedding or override control) -/ -| popDirectionalFormat +| public popDirectionalFormat /-- (`LRI`) left-to-right isolate (U+2066: the LR isolate control) -/ -| leftToRightIsolate +| public leftToRightIsolate /-- (`RLI`) right-toleft isolate (U+2067: the RL isolate control) -/ -| rightToLeftIsolate +| public rightToLeftIsolate /-- (`FSI`) first strong isolate (U+2068: the first strong isolate control) -/ -| firstStrongIsolate +| public firstStrongIsolate /-- (`PDI`) pop directional isolate (U+2069: terminates an isolate control) -/ -| popDirectionalIsolate +| public popDirectionalIsolate deriving Inhabited, DecidableEq /-- Bidi class: strong left-to-right (`L`) -/ -protected def BidiClass.L := leftToRight +public protected def BidiClass.L := leftToRight /-- Bidi class: strong right-to-left (`R`) -/ -protected def BidiClass.R := rightToLeft +public protected def BidiClass.R := rightToLeft /-- Bidi class: arabic letter (`AL`) -/ -protected def BidiClass.AL := arabicLetter +public protected def BidiClass.AL := arabicLetter /-- Bidi class: european number (`EN`) -/ -protected def BidiClass.EN := europeanNumber +public protected def BidiClass.EN := europeanNumber /-- Bidi class: european separator (`ES`) -/ -protected def BidiClass.ES := europeanSeparator +public protected def BidiClass.ES := europeanSeparator /-- Bidi class: european terminator (`ET`) -/ -protected def BidiClass.ET := europeanTerminator +public protected def BidiClass.ET := europeanTerminator /-- Bidi class: arabic number (`AN`) -/ -protected def BidiClass.AN := arabicNumber +public protected def BidiClass.AN := arabicNumber /-- Bidi class: common separator (`CS`) -/ -protected def BidiClass.CS := commonSeparator +public protected def BidiClass.CS := commonSeparator /-- Bidi class: nonspacing mark (`NSM`) -/ -protected def BidiClass.NSM := nonspacingMark +public protected def BidiClass.NSM := nonspacingMark /-- Bidi class: boundary neutral (`BN`) -/ -protected def BidiClass.BN := boundaryNeutral +public protected def BidiClass.BN := boundaryNeutral /-- Bidi class: paragraph separator (`B`) -/ -protected def BidiClass.B := paragraphSeparator +public protected def BidiClass.B := paragraphSeparator /-- Bidi class: segment separator (`S`) -/ -protected def BidiClass.S := segmentSeparator +public protected def BidiClass.S := segmentSeparator /-- Bidi class: white space (`WS`) -/ -protected def BidiClass.WS := whiteSpace +public protected def BidiClass.WS := whiteSpace /-- Bidi class: other neutral (`ON`) -/ -protected def BidiClass.ON := otherNeutral +public protected def BidiClass.ON := otherNeutral /-- Bidi class: left-to-right embedding (`LRE`) -/ -protected def BidiClass.LRE := leftToRightEmbedding +public protected def BidiClass.LRE := leftToRightEmbedding /-- Bidi class: left-to-right override (`LRO`) -/ -protected def BidiClass.LRO := leftToRightOverride +public protected def BidiClass.LRO := leftToRightOverride /-- Bidi class: right-to-left embedding (`RLE`) -/ -protected def BidiClass.RLE := rightToLeftEmbeding +public protected def BidiClass.RLE := rightToLeftEmbeding /-- Bidi class: right-to-left override (`RLO`) -/ -protected def BidiClass.RLO := rightToLeftOverride +public protected def BidiClass.RLO := rightToLeftOverride /-- Bidi class: pop directional format (`PDF`) -/ -protected def BidiClass.PDF := popDirectionalFormat +public protected def BidiClass.PDF := popDirectionalFormat /-- Bidi class: left-to-right isolate (`LRI`) -/ -protected def BidiClass.LRI := leftToRightIsolate +public protected def BidiClass.LRI := leftToRightIsolate /-- Bidi class: right-to-left isolate (`RLI`) -/ -protected def BidiClass.RLI := rightToLeftIsolate +public protected def BidiClass.RLI := rightToLeftIsolate /-- Bidi class: first strong isolate (`FSI`) -/ -protected def BidiClass.FSI := firstStrongIsolate +public protected def BidiClass.FSI := firstStrongIsolate /-- Bidi class: pop directional isolate (`PDI`) -/ -protected def BidiClass.PDI := popDirectionalIsolate +public protected def BidiClass.PDI := popDirectionalIsolate /-- String abbreviation for bidi class -/ -def BidiClass.toAbbrev : BidiClass → String +public def BidiClass.toAbbrev : BidiClass → String | leftToRight => "L" | rightToLeft => "R" | arabicLetter => "AL" @@ -738,7 +598,7 @@ def BidiClass.toAbbrev : BidiClass → String | popDirectionalIsolate => "PDI" /-- Get bidi class from abbreviation -/ -def BidiClass.ofAbbrev? (abbr : String.Slice) : Option BidiClass := +public def BidiClass.ofAbbrev? (abbr : String.Slice) : Option BidiClass := match abbr.chars.take 4 |>.toList with | ['L'] => some leftToRight | ['R'] => some rightToLeft @@ -766,7 +626,7 @@ def BidiClass.ofAbbrev? (abbr : String.Slice) : Option BidiClass := | _ => none @[inherit_doc BidiClass.ofAbbrev?] -def BidiClass.ofAbbrev! (abbr : String.Slice) : BidiClass := +public def BidiClass.ofAbbrev! (abbr : String.Slice) : BidiClass := match ofAbbrev? abbr with | some bc => bc | none => panic! "invalid bidi class abbreviation" @@ -780,7 +640,7 @@ instance : Repr BidiClass where /-- Check if valid script identifier -/ @[inline] -def Script.isValid (c : UInt32) : Bool := +public def Script.isValid (c : UInt32) : Bool := let c0 := (c >>> 24).toUInt8 let c1 := (c >>> 16).toUInt8 let c2 := (c >>> 8).toUInt8 @@ -791,15 +651,15 @@ def Script.isValid (c : UInt32) : Bool := && (c3 ≤ 'z'.toUInt8 && 'a'.toUInt8 ≤ c3) /-- Script identifier type -/ -structure Script where - code : UInt32 - is_valid : Script.isValid code +public structure Script where + public code : UInt32 + public is_valid : Script.isValid code deriving DecidableEq, Hashable namespace Script /-- Default value is `Zzzz` (`Unknown`) -/ -instance : Inhabited Script where +public instance : Inhabited Script where default := { code := (((('Z'.val <<< 8 ||| 'z'.val) <<< 8) ||| 'z'.val) <<< 8) ||| 'z'.val is_valid := by decide @@ -807,7 +667,7 @@ instance : Inhabited Script where /-- String abbreviation of script -/ @[extern "unicode_script_to_abbrev"] -def toAbbrev : Script → String +public def toAbbrev : Script → String | ⟨c, _⟩ => let c0 := Char.ofUInt8 (c >>> 24).toUInt8 let c1 := Char.ofUInt8 (c >>> 16).toUInt8 @@ -816,10 +676,10 @@ def toAbbrev : Script → String String.ofList [c0, c1, c2, c3] @[extern "unicode_script_of_abbrev"] -private opaque ofAbbrevAux (abbr : String) : UInt32 +opaque ofAbbrevAux (abbr : String) : UInt32 /-- Get script from abbreviation -/ -def ofAbbrev? (abbr : String.Slice) : Option Script := +public def ofAbbrev? (abbr : String.Slice) : Option Script := if abbr.utf8ByteSize = 4 then let code := ofAbbrevAux abbr.toString if h : Script.isValid code then @@ -830,6 +690,6 @@ def ofAbbrev? (abbr : String.Slice) : Option Script := none @[inline, inherit_doc ofAbbrev?] -def ofAbbrev! (abbr : String.Slice) : Script := ofAbbrev? abbr |>.get! +public def ofAbbrev! (abbr : String.Slice) : Script := ofAbbrev? abbr |>.get! end Script From b5a2cceec503c0e53c4c26bce655644e618cd00a Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 04:25:03 -0400 Subject: [PATCH 03/10] chore: cleanup UnicodeData --- UnicodeData/Aliases.lean | 36 +++++++++++------------ UnicodeData/Basic.lean | 60 +++++++++++++++++++-------------------- UnicodeData/PropList.lean | 42 +++++++++++++-------------- UnicodeData/Scripts.lean | 10 ++----- 4 files changed, 68 insertions(+), 80 deletions(-) diff --git a/UnicodeData/Aliases.lean b/UnicodeData/Aliases.lean index 665396de0..ae89f40d5 100644 --- a/UnicodeData/Aliases.lean +++ b/UnicodeData/Aliases.lean @@ -3,16 +3,14 @@ Copyright © 2026 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ module -public import Std.Data.HashMap import UnicodeBasic.Types import UnicodeBasic.CharacterDatabase - -public section +public import Std.Data.HashMap namespace Unicode /-- Type for list of aliases -/ -structure Aliases where +public structure Aliases where aliasMap : Std.HashMap String.Slice (Array String.Slice) nameMap : Std.HashMap String.Slice String.Slice deriving Inhabited @@ -36,30 +34,30 @@ initialize PropertyAliases.data : Aliases ← do /-- Get the long name of a property -/ @[inline] -def PropertyAliases.getLongName? (prop : String.Slice) : Option String.Slice := do +public def PropertyAliases.getLongName? (prop : String.Slice) : Option String.Slice := do data.nameMap.get? prop @[inline, inherit_doc PropertyAliases.getLongName?] -def PropertyAliases.getLongName! (prop : String.Slice) : String.Slice := +public def PropertyAliases.getLongName! (prop : String.Slice) : String.Slice := getLongName? prop |>.get! /-- Get all aliases of a property -/ @[inline] -def PropertyAliases.getAliases? (prop : String.Slice) : Option (Array String.Slice) := do +public def PropertyAliases.getAliases? (prop : String.Slice) : Option (Array String.Slice) := do let prop ← getLongName? prop data.aliasMap.get? prop @[inline, inherit_doc PropertyAliases.getAliases?] -def PropertyAliases.getAliases! (prop : String.Slice) : Array String.Slice := +public def PropertyAliases.getAliases! (prop : String.Slice) : Array String.Slice := getAliases? prop |>.get! /-- Get the short name of a property -/ -def PropertyAliases.getShortName? (prop : String.Slice) : Option String.Slice := do +public def PropertyAliases.getShortName? (prop : String.Slice) : Option String.Slice := do let a ← getAliases? prop a[1]? <|> a[0]? @[inline, inherit_doc PropertyAliases.getShortName?] -def PropertyAliases.getShortName! (prop : String.Slice) : String.Slice := +public def PropertyAliases.getShortName! (prop : String.Slice) : String.Slice := getShortName? prop |>.get! protected def PropertyValueAliases.txt := include_str "../data/PropertyValueAliases.txt" @@ -84,46 +82,44 @@ initialize PropertyValueAliases.data : Std.HashMap String.Slice Aliases ← do /-- Get all values for a property -/ @[inline] -def PropertyAliases.getValues? (prop : String.Slice) : Option (Array String.Slice) := do +public def PropertyAliases.getValues? (prop : String.Slice) : Option (Array String.Slice) := do let prop ← PropertyAliases.getLongName? prop let al ← PropertyValueAliases.data.get? prop return al.aliasMap.keysArray @[inline, inherit_doc PropertyAliases.getValues?] -def PropertyAliases.getValues! (prop : String.Slice) : Array String.Slice := +public def PropertyAliases.getValues! (prop : String.Slice) : Array String.Slice := getValues? prop |>.get! /-- Get the long name of a property value -/ @[inline] -def PropertyValueAliases.getLongName? (prop val : String.Slice) : Option String.Slice := do +public def PropertyValueAliases.getLongName? (prop val : String.Slice) : Option String.Slice := do let prop ← PropertyAliases.getLongName? prop let al ← data.get? prop al.nameMap.get? val @[inline, inherit_doc PropertyValueAliases.getLongName?] -def PropertyValueAliases.getLongName! (prop val : String.Slice) : String.Slice := +public def PropertyValueAliases.getLongName! (prop val : String.Slice) : String.Slice := getLongName? prop val |>.get! /-- Get all aliases of a property value -/ @[inline] -def PropertyValueAliases.getAliases? (prop val : String.Slice) : Option (Array String.Slice) := do +public def PropertyValueAliases.getAliases? (prop val : String.Slice) : Option (Array String.Slice) := do let prop ← PropertyAliases.getLongName? prop let val ← getLongName? prop val let al ← data.get? prop al.aliasMap.get? val @[inline, inherit_doc PropertyAliases.getAliases?] -def PropertyValueAliases.getAliases! (prop val : String.Slice) : Array String.Slice := +public def PropertyValueAliases.getAliases! (prop val : String.Slice) : Array String.Slice := getAliases? prop val |>.get! /-- Get the short name of a property value -/ @[inline] -def PropertyValueAliases.getShortName? (prop val : String.Slice) : Option String.Slice := do +public def PropertyValueAliases.getShortName? (prop val : String.Slice) : Option String.Slice := do let a ← getAliases? prop val a[1]? <|> a[0]? @[inline, inherit_doc PropertyValueAliases.getShortName?] -def PropertyValueAliases.getShortName! (prop val : String.Slice) : String.Slice := +public def PropertyValueAliases.getShortName! (prop val : String.Slice) : String.Slice := getShortName? prop val |>.get! - -end Unicode diff --git a/UnicodeData/Basic.lean b/UnicodeData/Basic.lean index 63cfdf0cf..7386e8513 100644 --- a/UnicodeData/Basic.lean +++ b/UnicodeData/Basic.lean @@ -7,37 +7,35 @@ import UnicodeBasic.CharacterDatabase import UnicodeBasic.Hangul public import UnicodeBasic.Types -public section - namespace Unicode /-- Structure for data from `UnicodeData.txt` -/ -structure UnicodeData where +public structure UnicodeData where /-- Code Value -/ - code : UInt32 + public code : UInt32 /-- Character Name -/ - name : String.Slice + public name : String.Slice /-- General Category -/ - gc : GC + public gc : GC /-- Bidirectional Class -/ - bidi : BidiClass + public bidi : BidiClass /-- Canonical Combining Class -/ - cc : Nat := 0 + public cc : Nat := 0 /-- Bidirectional Mirrored -/ - bidiMirrored : Bool := false + public bidiMirrored : Bool := false /-- Character Decomposition Mapping -/ - decomp : Option DecompositionMapping := none + public decomp : Option DecompositionMapping := none /-- Numeric Value -/ - numeric : Option NumericType := none + public numeric : Option NumericType := none /-- Uppercase Mapping -/ - uppercase : Option Char := none + public uppercase : Option Char := none /-- Lowercase Mapping -/ - lowercase : Option Char := none + public lowercase : Option Char := none /-- Titlecase Mapping -/ - titlecase : Option Char := none + public titlecase : Option Char := none deriving BEq -instance : Inhabited UnicodeData where +public instance : Inhabited UnicodeData where default := { code := 0 name := "" @@ -46,49 +44,49 @@ instance : Inhabited UnicodeData where } /-- Make `UnicodeData` for noncharacter code point -/ -def UnicodeData.mkNoncharacter (code : UInt32) : UnicodeData where +public def UnicodeData.mkNoncharacter (code : UInt32) : UnicodeData where code := code name := -- Extracted from property `Noncharacter_Code_Point` let isReserved := (code &&& 0xFFFFFFF0 == 0x0000FDD0) || (code &&& 0xFFFFFFF0 == 0x0000FDE0) || (code &&& 0x0000FFFE == 0x0000FFFE) - (if isReserved then "" + (if isReserved then "" bidi := .BN gc := .Cn /-- Make `UnicodeData` for generic control code point -/ def UnicodeData.mkControl (c : UInt32) : UnicodeData where code := c - name := s!"" + name := s!"" bidi := .BN gc := .Cc /-- Make `UnicodeData` for generic surrogate code point -/ def UnicodeData.mkSurrogate (c : UInt32) : UnicodeData where code := c - name := s!"" + name := s!"" bidi := .L gc := .Cs /-- Make `UnicodeData` for generic private use code point -/ def UnicodeData.mkPrivateUse (c : UInt32) : UnicodeData where code := c - name := s!"" + name := s!"" bidi := .L gc := .Co /-- Make `UnicodeData` for CJK compatibilty ideograph code point -/ def UnicodeData.mkCJKCompatibilityIdeograph (c : UInt32) : UnicodeData where code := c - name := s!"CJK COMPATIBILITY IDEOGRAPH-{toHexStringAux c}" + name := s!"CJK COMPATIBILITY IDEOGRAPH-{toHexStringRaw c}" bidi := .L gc := .Lo /-- Make `UnicodeData` for CJK unified ideograph code point -/ def UnicodeData.mkCJKUnifiedIdeograph (c : UInt32) : UnicodeData where code := c - name := s!"CJK UNIFIED IDEOGRAPH-{toHexStringAux c}" + name := s!"CJK UNIFIED IDEOGRAPH-{toHexStringRaw c}" bidi := .L gc := .Lo @@ -109,7 +107,7 @@ def UnicodeData.mkHangulSyllable (c : UInt32) : UnicodeData := /-- Make `UnicodeData` for Tangut ideograph code point -/ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where code := c - name := s!"TANGUT IDEOGRAPH-{toHexStringAux c}" + name := s!"TANGUT IDEOGRAPH-{toHexStringRaw c}" bidi := .L gc := .Lo @@ -237,7 +235,7 @@ unsafe initialize UnicodeData.data : Array UnicodeData ← return arr /-- Get code point data from `UnicodeData.txt` -/ -partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do +public partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do if code > Unicode.max then none else if code ≤ 0x0377 then @@ -252,7 +250,7 @@ partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do assert! (data.code == code) if data.name == "" then return {data with - name := s!""} + name := s!""} else return data else @@ -316,25 +314,25 @@ where find mid hi @[inherit_doc getUnicodeData?] -def getUnicodeData! (code : UInt32) := +public def getUnicodeData! (code : UInt32) := match getUnicodeData? code with | some data => data | none => panic! "code point out of range" /-- Get character data from `UnicodeData.txt` -/ -def getUnicodeData (char : Char) : UnicodeData := +public def getUnicodeData (char : Char) : UnicodeData := match getUnicodeData? char.val with | some data => data | none => unreachable! /-- Stream type to roll through all code points up to `Unicode.max`, yielding `UnicodeData` -/ -structure UnicodeDataStream where +public structure UnicodeDataStream where code : UInt32 := 0 index : USize := 0 default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter deriving Inhabited -def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do +public def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do let c := s.code let i := s.index if c > Unicode.max then @@ -345,7 +343,7 @@ def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × Uni if c < d.code then return (s.default c, {s with code := c+1}) else if n == "" then - let d := {d with name := s!""} + let d := {d with name := s!""} return (d, {s with code := c+1, index := i+1}) else if n.front == '<' then if n.takeEnd 8 == ", First>" then @@ -372,5 +370,5 @@ def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × Uni else return (.mkNoncharacter c, {s with code := c+1}) -instance : Std.Stream UnicodeDataStream UnicodeData where +public instance : Std.Stream UnicodeDataStream UnicodeData where next? := UnicodeDataStream.next? diff --git a/UnicodeData/PropList.lean b/UnicodeData/PropList.lean index 8a2133be8..aade17f29 100644 --- a/UnicodeData/PropList.lean +++ b/UnicodeData/PropList.lean @@ -6,30 +6,28 @@ module import UnicodeBasic.Types import UnicodeBasic.CharacterDatabase -public section - namespace Unicode /-- Structure containing supported character properties from `PropList.txt` -/ -structure PropList where +public structure PropList where /-- property `Noncharacter_Code_Point` -/ - noncharacterCodePoint : Array (UInt32 × Option UInt32) := #[] + public noncharacterCodePoint : Array (UInt32 × Option UInt32) := #[] /-- property `White_Space` -/ - whiteSpace : Array (UInt32 × Option UInt32) := #[] + public whiteSpace : Array (UInt32 × Option UInt32) := #[] /-- property `Other_Math` -/ - otherMath : Array (UInt32 × Option UInt32) := #[] + public otherMath : Array (UInt32 × Option UInt32) := #[] /-- property `Other_Alphabetic` -/ - otherAlphabetic : Array (UInt32 × Option UInt32) := #[] + public otherAlphabetic : Array (UInt32 × Option UInt32) := #[] /-- property `Other_Lowercase` -/ - otherLowercase : Array (UInt32 × Option UInt32) := #[] + public otherLowercase : Array (UInt32 × Option UInt32) := #[] /-- property `Other_Uppercase` -/ - otherUppercase : Array (UInt32 × Option UInt32) := #[] + public otherUppercase : Array (UInt32 × Option UInt32) := #[] /-- property `Other_Default_Ignorable_Code_Point` -/ - otherDefaultIgnorableCodePoint : Array (UInt32 × Option UInt32) := #[] + public otherDefaultIgnorableCodePoint : Array (UInt32 × Option UInt32) := #[] /-- property `Prepended_Concatenation_Mark` -/ - prependedConcatenationMark : Array (UInt32 × Option UInt32) := #[] + public prependedConcatenationMark : Array (UInt32 × Option UInt32) := #[] /-- property `Variation_Selector` -/ - variationSelector : Array (UInt32 × Option UInt32) := #[] + public variationSelector : Array (UInt32 × Option UInt32) := #[] /-- property `Deprecated` -/ deprecated : Array (UInt32 × Option UInt32) := #[] deriving Inhabited, Repr @@ -85,7 +83,7 @@ private partial def find (code : UInt32) (data : Array (UInt32 × Option UInt32) /-- Check if code point has `Noncharacter_Code_Point` property from `PropList.txt` -/ @[inline] -def PropList.isNoncharacterCodePoint (code : UInt32) : Bool := +public def PropList.isNoncharacterCodePoint (code : UInt32) : Bool := let data := PropList.data.noncharacterCodePoint if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -94,7 +92,7 @@ def PropList.isNoncharacterCodePoint (code : UInt32) : Bool := /-- Check if code point has `White_Space` property from `PropList.txt` -/ @[inline] -def PropList.isWhiteSpace (code : UInt32) : Bool := +public def PropList.isWhiteSpace (code : UInt32) : Bool := let data := PropList.data.whiteSpace if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -103,7 +101,7 @@ def PropList.isWhiteSpace (code : UInt32) : Bool := /-- Check if code point has `Other_Math` property from `PropList.txt` -/ @[inline] -def PropList.isOtherMath (code : UInt32) : Bool := +public def PropList.isOtherMath (code : UInt32) : Bool := let data := PropList.data.otherMath if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -112,7 +110,7 @@ def PropList.isOtherMath (code : UInt32) : Bool := /-- Check if code point has `Other_Alphabetic` property from `PropList.txt` -/ @[inline] -def PropList.isOtherAlphabetic (code : UInt32) : Bool := +public def PropList.isOtherAlphabetic (code : UInt32) : Bool := let data := PropList.data.otherAlphabetic if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -121,7 +119,7 @@ def PropList.isOtherAlphabetic (code : UInt32) : Bool := /-- Check if code point has `Other_Lowercase` property from `PropList.txt` -/ @[inline] -def PropList.isOtherLowercase (code : UInt32) : Bool := +public def PropList.isOtherLowercase (code : UInt32) : Bool := let data := PropList.data.otherLowercase if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -130,7 +128,7 @@ def PropList.isOtherLowercase (code : UInt32) : Bool := /-- Check if code point has `Other_Uppercase` property from `PropList.txt` -/ @[inline] -def PropList.isOtherUppercase (code : UInt32) : Bool := +public def PropList.isOtherUppercase (code : UInt32) : Bool := let data := PropList.data.otherUppercase if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -139,7 +137,7 @@ def PropList.isOtherUppercase (code : UInt32) : Bool := /-- Check if code point has `Other_Default_Ignorable_Code_Point` property from `PropList.txt` -/ @[inline] -def PropList.isOtherDefaultIgnorableCodePoint (code : UInt32) : Bool := +public def PropList.isOtherDefaultIgnorableCodePoint (code : UInt32) : Bool := let data := PropList.data.otherDefaultIgnorableCodePoint if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -148,7 +146,7 @@ def PropList.isOtherDefaultIgnorableCodePoint (code : UInt32) : Bool := /-- Check if code point has `Prepended_Concatenation_Mark` property from `PropList.txt` -/ @[inline] -def PropList.isPrependedConcatenationMark (code : UInt32) : Bool := +public def PropList.isPrependedConcatenationMark (code : UInt32) : Bool := let data := PropList.data.prependedConcatenationMark if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -157,7 +155,7 @@ def PropList.isPrependedConcatenationMark (code : UInt32) : Bool := /-- Check if code point has `Variation_Selector` property from `PropList.txt` -/ @[inline] -def PropList.isVariationSelector (code : UInt32) : Bool := +public def PropList.isVariationSelector (code : UInt32) : Bool := let data := PropList.data.variationSelector if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with @@ -166,7 +164,7 @@ def PropList.isVariationSelector (code : UInt32) : Bool := /-- Check if code point has `Deprecated` property from `PropList.txt` -/ @[inline] -def PropList.isDeprecated (code : UInt32) : Bool := +public def PropList.isDeprecated (code : UInt32) : Bool := let data := PropList.data.deprecated if data.size == 0 || code < data[0]!.fst then false else match data[find code data 0 data.size]! with diff --git a/UnicodeData/Scripts.lean b/UnicodeData/Scripts.lean index 221d22325..225db2971 100644 --- a/UnicodeData/Scripts.lean +++ b/UnicodeData/Scripts.lean @@ -7,12 +7,10 @@ public import UnicodeData.Aliases import UnicodeBasic.Types import UnicodeBasic.CharacterDatabase -public section - namespace Unicode /-- Type for scripts data -/ -abbrev Scripts := Std.HashMap String.Slice (Array (UInt32 × UInt32)) +public abbrev Scripts := Std.HashMap String.Slice (Array (UInt32 × UInt32)) /-- Raw string from `Scripts.txt` -/ def Scripts.txt := include_str "../data/Scripts.txt" @@ -39,12 +37,10 @@ initialize Scripts.data : Scripts ← do /-- Get table for given script -/ @[inline] -def Scripts.getTable? (sc : String.Slice) : Option <| Array (UInt32 × UInt32) := do +public def Scripts.getTable? (sc : String.Slice) : Option <| Array (UInt32 × UInt32) := do let sc ← PropertyValueAliases.getLongName! "Script" sc data.get? sc @[inline, inherit_doc Scripts.getTable?] -def Scripts.getTable! (sc : String.Slice) : Array (UInt32 × UInt32) := +public def Scripts.getTable! (sc : String.Slice) : Array (UInt32 × UInt32) := getTable? sc |>.get! - -end Unicode From 07e15cefc453e79f0882071c720f12e4901efc27 Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 04:26:11 -0400 Subject: [PATCH 04/10] fix: expose type --- UnicodeBasic/Types.lean | 1 + 1 file changed, 1 insertion(+) diff --git a/UnicodeBasic/Types.lean b/UnicodeBasic/Types.lean index fce044543..5f64e0b1d 100644 --- a/UnicodeBasic/Types.lean +++ b/UnicodeBasic/Types.lean @@ -391,6 +391,7 @@ public def NumericType.isDigit : NumericType → Bool Unicode property: `Numeric_Value` -/ +@[expose] public def NumericType.value : NumericType → Int ⊕ Int × Nat | decimal n => .inl n | digit n => .inl n From 6b0500da24abe34f8a2836629d9c784281a98bbd Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 04:26:11 -0400 Subject: [PATCH 05/10] fix: public data --- UnicodeData/Basic.lean | 2 +- UnicodeData/PropList.lean | 2 +- UnicodeData/Scripts.lean | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/UnicodeData/Basic.lean b/UnicodeData/Basic.lean index 7386e8513..f94240eef 100644 --- a/UnicodeData/Basic.lean +++ b/UnicodeData/Basic.lean @@ -115,7 +115,7 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where protected def UnicodeData.txt := include_str "../data/UnicodeData.txt" /-- Parse `UnicodeData.txt` -/ -unsafe initialize UnicodeData.data : Array UnicodeData ← +public unsafe initialize UnicodeData.data : Array UnicodeData ← let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do /- The value of the `Decomposition_Mapping` property for a character is diff --git a/UnicodeData/PropList.lean b/UnicodeData/PropList.lean index aade17f29..b5fc97632 100644 --- a/UnicodeData/PropList.lean +++ b/UnicodeData/PropList.lean @@ -35,7 +35,7 @@ deriving Inhabited, Repr /-- Raw string form `PropList.txt` -/ protected def PropList.txt := include_str "../data/PropList.txt" -unsafe initialize PropList.data : PropList ← +public unsafe initialize PropList.data : PropList ← let stream := UCDStream.ofString PropList.txt let mut list : PropList := {} for record in stream do diff --git a/UnicodeData/Scripts.lean b/UnicodeData/Scripts.lean index 225db2971..39fc83bce 100644 --- a/UnicodeData/Scripts.lean +++ b/UnicodeData/Scripts.lean @@ -15,7 +15,7 @@ public abbrev Scripts := Std.HashMap String.Slice (Array (UInt32 × UInt32)) /-- Raw string from `Scripts.txt` -/ def Scripts.txt := include_str "../data/Scripts.txt" -initialize Scripts.data : Scripts ← do +public initialize Scripts.data : Scripts ← do let stream := UCDStream.ofString Scripts.txt let mut t := {} for record in stream do From d97721f512441c26c4d07557ba61762141c43680 Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 04:38:12 -0400 Subject: [PATCH 06/10] chore: cleanup executables --- makeTables.lean | 118 ++++++++++++++++++++++++------------------------ testTables.lean | 2 +- 2 files changed, 60 insertions(+), 60 deletions(-) diff --git a/makeTables.lean b/makeTables.lean index f7c0e1a7c..00f745c69 100644 --- a/makeTables.lean +++ b/makeTables.lean @@ -144,9 +144,9 @@ def mkDecompositionMapping : IO <| Array (UInt32 × String) := do for data in UnicodeData.data do match data.decomp with | some ⟨none, l⟩ => - t := t.push (data.code, ";" ++ ";".intercalate (l.map (toHexStringAux <| Char.val .))) + t := t.push (data.code, ";" ++ ";".intercalate (l.map (toHexStringRaw <| Char.val .))) | some ⟨some k, l⟩ => - t := t.push (data.code, s!"{k};" ++ ";".intercalate (l.map (toHexStringAux <| Char.val ·))) + t := t.push (data.code, s!"{k};" ++ ";".intercalate (l.map (toHexStringRaw <| Char.val ·))) | _ => continue return t @@ -548,9 +548,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Bidi_Class" => IO.println s!"Generating table {arg}" @@ -558,9 +558,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁, bc) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";;" ++ bc.toAbbrev + file.putStrLn <| toHexStringRaw c₀ ++ ";;" ++ bc.toAbbrev else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ ++ ";" ++ bc.toAbbrev + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ ++ ";" ++ bc.toAbbrev IO.println s!"Size: {(statsData table).1} + {(statsData table).2}" | "Bidi_Mirrored" => IO.println s!"Generating table {arg}" @@ -568,9 +568,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Canonical_Combining_Class" => IO.println s!"Generating table {arg}" @@ -578,16 +578,16 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁, cc) in table do if c₀ == c₁ then - file.putStrLn <| ";".intercalate [toHexStringAux c₀, "", toString cc] + file.putStrLn <| ";".intercalate [toHexStringRaw c₀, "", toString cc] else - file.putStrLn <| ";".intercalate [toHexStringAux c₀, toHexStringAux c₁, toString cc] + file.putStrLn <| ";".intercalate [toHexStringRaw c₀, toHexStringRaw c₁, toString cc] IO.println s!"Size: {(statsData table).1} + {(statsData table).2}" | "Canonical_Decomposition_Mapping" => IO.println s!"Generating table {arg}" let table ← mkCanonicalDecompositionMapping IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c, l) in table do - file.putStrLn <| toHexStringAux c ++ ";" ++ ";".intercalate (l.map fun c => toHexStringAux c.val) + file.putStrLn <| toHexStringRaw c ++ ";" ++ ";".intercalate (l.map fun c => toHexStringRaw c.val) IO.println s!"Size: {table.size}" | "Case_Mapping" => IO.println s!"Generating table {arg}" @@ -595,21 +595,21 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁, uc, lc, tc) in table do if c₀ == c₁ then - file.putStr <| toHexStringAux c₀ ++ ";" + file.putStr <| toHexStringRaw c₀ ++ ";" if c₀ == uc then file.putStr <| ";" else - file.putStr <| ";" ++ toHexStringAux uc + file.putStr <| ";" ++ toHexStringRaw uc if c₀ == lc then file.putStr <| ";" else - file.putStr <| ";" ++ toHexStringAux lc + file.putStr <| ";" ++ toHexStringRaw lc else - file.putStr <| ";".intercalate <| [c₀, c₁, uc, lc].map toHexStringAux + file.putStr <| ";".intercalate <| [c₀, c₁, uc, lc].map toHexStringRaw if uc == tc then file.putStrLn ";" else - file.putStrLn <| ";" ++ toHexStringAux tc + file.putStrLn <| ";" ++ toHexStringRaw tc IO.println s!"Size: {(statsData table).1} + {(statsData table).2}" | "Cased" => IO.println s!"Generating table {arg}" @@ -617,16 +617,16 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Decomposition_Mapping" => IO.println s!"Generating table {arg}" let table ← mkDecompositionMapping IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c, s) in table do - file.putStrLn <| toHexStringAux c ++ ";" ++ s + file.putStrLn <| toHexStringRaw c ++ ";" ++ s IO.println s!"Size: {table.size}" | "Default_Ignorable_Code_Point" => IO.println s!"Generating table {arg}" @@ -634,9 +634,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "General_Category" => IO.println s!"Generating table {arg}" @@ -644,9 +644,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁, v) in table do if c₀ == c₁ then - file.putStrLn <| ";".intercalate [toHexStringAux c₀, "", toString v] + file.putStrLn <| ";".intercalate [toHexStringRaw c₀, "", toString v] else - file.putStrLn <| ";".intercalate [toHexStringAux c₀, toHexStringAux c₁, toString v] + file.putStrLn <| ";".intercalate [toHexStringRaw c₀, toHexStringRaw c₁, toString v] IO.println s!"Size: {(statsData table).1} + {(statsData table).2}" | "Lowercase" => IO.println s!"Generating table {arg}" @@ -654,9 +654,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Math" => IO.println s!"Generating table {arg}" @@ -664,9 +664,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Name" => IO.println s!"Generating table {arg}" @@ -674,9 +674,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁, n) in table do if c₀ == c₁ then - file.putStrLn <| ";".intercalate [toHexStringAux c₀, "", n] + file.putStrLn <| ";".intercalate [toHexStringRaw c₀, "", n] else - file.putStrLn <| ";".intercalate [toHexStringAux c₀, toHexStringAux c₁, n] + file.putStrLn <| ";".intercalate [toHexStringRaw c₀, toHexStringRaw c₁, n] IO.println s!"Size: {(statsData table).1} + {(statsData table).2}" | "Noncharacter_Code_Point" => IO.println s!"Generating table {arg}" @@ -684,9 +684,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Numeric_Value" => IO.println s!"Generating table {arg}" @@ -694,15 +694,15 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁, n) in table do match n with - | .decimal _ => file.putStrLn <| ";".intercalate [toHexStringAux c₀, toHexStringAux c₁, "decimal"] + | .decimal _ => file.putStrLn <| ";".intercalate [toHexStringRaw c₀, toHexStringRaw c₁, "decimal"] | .digit v => if c₀ == c₁ then - file.putStrLn <| ";".intercalate [toHexStringAux c₀, "", s!"digit {v.val}"] + file.putStrLn <| ";".intercalate [toHexStringRaw c₀, "", s!"digit {v.val}"] else let last := v.val + c₁.toNat - c₀.toNat - file.putStrLn <| ";".intercalate [toHexStringAux c₀, toHexStringAux c₁, s!"digit {v.val}-{last}"] - | .numeric v none => file.putStrLn <| ";".intercalate [toHexStringAux c₀, "", s!"numeric {v}"] - | .numeric v (some d) => file.putStrLn <| ";".intercalate [toHexStringAux c₀, "", s!"numeric {v}/{d}"] + file.putStrLn <| ";".intercalate [toHexStringRaw c₀, toHexStringRaw c₁, s!"digit {v.val}-{last}"] + | .numeric v none => file.putStrLn <| ";".intercalate [toHexStringRaw c₀, "", s!"numeric {v}"] + | .numeric v (some d) => file.putStrLn <| ";".intercalate [toHexStringRaw c₀, "", s!"numeric {v}/{d}"] IO.println s!"Size: {(statsData table).1} + {(statsData table).2}" | "Other_Alphabetic" => IO.println s!"Generating table {arg}" @@ -710,9 +710,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Other_Default_Ignorable_Code_Point" => IO.println s!"Generating table {arg}" @@ -720,9 +720,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Other_Lowercase" => IO.println s!"Generating table {arg}" @@ -730,9 +730,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Other_Math" => IO.println s!"Generating table {arg}" @@ -740,9 +740,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Other_Uppercase" => IO.println s!"Generating table {arg}" @@ -750,9 +750,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Other" => IO.println s!"Generating table {arg}" @@ -760,9 +760,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁, v) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";;" ++ toString v + file.putStrLn <| toHexStringRaw c₀ ++ ";;" ++ toString v else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ ++ ";" ++ toString v + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ ++ ";" ++ toString v IO.println s!"Size: {(statsData table).1} + {(statsData table).2}" | "Prepended_Concatenation_Mark" => IO.println s!"Generating table {arg}" @@ -770,16 +770,16 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Script_Name" => IO.println s!"Generating table {arg}" let table := mkScriptName IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c, name) in table do - file.putStrLn <| toHexStringAux c ++ ";" ++ name + file.putStrLn <| toHexStringRaw c ++ ";" ++ name IO.println s!"Size: {table.size}" | "Titlecase" => IO.println s!"Generating table {arg}" @@ -787,9 +787,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Uppercase" => IO.println s!"Generating table {arg}" @@ -797,9 +797,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "Variation_Selector" => IO.println s!"Generating table {arg}" @@ -807,9 +807,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | "White_Space" => IO.println s!"Generating table {arg}" @@ -817,9 +817,9 @@ def main (args : List String) : IO UInt32 := do IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do for (c₀, c₁) in table do if c₀ == c₁ then - file.putStrLn <| toHexStringAux c₀ ++ ";" + file.putStrLn <| toHexStringRaw c₀ ++ ";" else - file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ + file.putStrLn <| toHexStringRaw c₀ ++ ";" ++ toHexStringRaw c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" | _ => IO.println s!"Unknown property {arg}" return 0 diff --git a/testTables.lean b/testTables.lean index 3a2c6f074..155f19d79 100644 --- a/testTables.lean +++ b/testTables.lean @@ -140,5 +140,5 @@ def main (args : List String) : IO UInt32 := do for t in tests do if t.1 ∈ args && !t.2 d then err := 1 - IO.println s!"Error: {t.1} {toHexStringAux d.code}" + IO.println s!"Error: {t.1} {toHexStringRaw d.code}" return err From adee7b1dfe4f3441f2cbb4556450db085c2a50ac Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 06:21:00 -0400 Subject: [PATCH 07/10] fix: makeTables --- UnicodeBasic/Types.lean | 102 ++++++++++++++++++++-------------------- makeTables.lean | 3 +- 2 files changed, 53 insertions(+), 52 deletions(-) diff --git a/UnicodeBasic/Types.lean b/UnicodeBasic/Types.lean index 5f64e0b1d..e154528d1 100644 --- a/UnicodeBasic/Types.lean +++ b/UnicodeBasic/Types.lean @@ -127,55 +127,55 @@ public instance : HasSubset GC where public instance (x y : GC) : Decidable (x ⊆ y) := inferInstanceAs (Decidable (_ == _)) -public protected def none : GC := (0x00000000 : UInt32) -public protected def univ : GC := (0x3FFFFFFF : UInt32) - -public protected def Lu : GC := (0x00000001 : UInt32) -public protected def Ll : GC := (0x00000002 : UInt32) -public protected def Lt : GC := (0x00000004 : UInt32) -public protected def Lm : GC := (0x00000008 : UInt32) -public protected def Lo : GC := (0x00000010 : UInt32) -public protected def LC : GC := .Lu ||| .Ll ||| .Lt -public protected def L : GC := .Lu ||| .Ll ||| .Lt ||| .Lm ||| .Lo - -public protected def Mn : GC := (0x00000020 : UInt32) -public protected def Mc : GC := (0x00000040 : UInt32) -public protected def Me : GC := (0x00000080 : UInt32) -public protected def M : GC := .Mn ||| .Mc ||| .Me - -public protected def Nd : GC := (0x00000100 : UInt32) -public protected def Nl : GC := (0x00000200 : UInt32) -public protected def No : GC := (0x00000400 : UInt32) -public protected def N : GC := .Nd ||| .Nl ||| .No - -public protected def Pc : GC := (0x00000800 : UInt32) -public protected def Pd : GC := (0x00001000 : UInt32) -public protected def Ps : GC := (0x00002000 : UInt32) -public protected def Pe : GC := (0x00004000 : UInt32) -public protected def Pi : GC := (0x00008000 : UInt32) -public protected def Pf : GC := (0x00010000 : UInt32) -public protected def Po : GC := (0x00020000 : UInt32) -public protected def PG : GC := .Ps ||| .Pe -public protected def PQ : GC := .Pi ||| .Pf -public protected def P : GC := .Pc ||| .Pd ||| .Ps ||| .Pe ||| .Pi ||| .Pf ||| .Po - -public protected def Sm : GC := (0x00040000 : UInt32) -public protected def Sc : GC := (0x00080000 : UInt32) -public protected def Sk : GC := (0x00100000 : UInt32) -public protected def So : GC := (0x00200000 : UInt32) -public protected def S : GC := .Sm ||| .Sc ||| .Sk ||| .So - -public protected def Zs : GC := (0x00400000 : UInt32) -public protected def Zl : GC := (0x00800000 : UInt32) -public protected def Zp : GC := (0x01000000 : UInt32) -public protected def Z : GC := .Zs ||| .Zl ||| .Zp - -public protected def Cc : GC := (0x02000000 : UInt32) -public protected def Cf : GC := (0x04000000 : UInt32) -public protected def Cs : GC := (0x08000000 : UInt32) -public protected def Co : GC := (0x10000000 : UInt32) -public protected def Cn : GC := (0x20000000 : UInt32) -public protected def C : GC := .Cc ||| .Cf ||| .Cs ||| .Co ||| .Cn +public protected abbrev none : GC := (0x00000000 : UInt32) +public protected abbrev univ : GC := (0x3FFFFFFF : UInt32) + +public protected abbrev Lu : GC := (0x00000001 : UInt32) +public protected abbrev Ll : GC := (0x00000002 : UInt32) +public protected abbrev Lt : GC := (0x00000004 : UInt32) +public protected abbrev Lm : GC := (0x00000008 : UInt32) +public protected abbrev Lo : GC := (0x00000010 : UInt32) +public protected abbrev LC : GC := .Lu ||| .Ll ||| .Lt +public protected abbrev L : GC := .Lu ||| .Ll ||| .Lt ||| .Lm ||| .Lo + +public protected abbrev Mn : GC := (0x00000020 : UInt32) +public protected abbrev Mc : GC := (0x00000040 : UInt32) +public protected abbrev Me : GC := (0x00000080 : UInt32) +public protected abbrev M : GC := .Mn ||| .Mc ||| .Me + +public protected abbrev Nd : GC := (0x00000100 : UInt32) +public protected abbrev Nl : GC := (0x00000200 : UInt32) +public protected abbrev No : GC := (0x00000400 : UInt32) +public protected abbrev N : GC := .Nd ||| .Nl ||| .No + +public protected abbrev Pc : GC := (0x00000800 : UInt32) +public protected abbrev Pd : GC := (0x00001000 : UInt32) +public protected abbrev Ps : GC := (0x00002000 : UInt32) +public protected abbrev Pe : GC := (0x00004000 : UInt32) +public protected abbrev Pi : GC := (0x00008000 : UInt32) +public protected abbrev Pf : GC := (0x00010000 : UInt32) +public protected abbrev Po : GC := (0x00020000 : UInt32) +public protected abbrev PG : GC := .Ps ||| .Pe +public protected abbrev PQ : GC := .Pi ||| .Pf +public protected abbrev P : GC := .Pc ||| .Pd ||| .Ps ||| .Pe ||| .Pi ||| .Pf ||| .Po + +public protected abbrev Sm : GC := (0x00040000 : UInt32) +public protected abbrev Sc : GC := (0x00080000 : UInt32) +public protected abbrev Sk : GC := (0x00100000 : UInt32) +public protected abbrev So : GC := (0x00200000 : UInt32) +public protected abbrev S : GC := .Sm ||| .Sc ||| .Sk ||| .So + +public protected abbrev Zs : GC := (0x00400000 : UInt32) +public protected abbrev Zl : GC := (0x00800000 : UInt32) +public protected abbrev Zp : GC := (0x01000000 : UInt32) +public protected abbrev Z : GC := .Zs ||| .Zl ||| .Zp + +public protected abbrev Cc : GC := (0x02000000 : UInt32) +public protected abbrev Cf : GC := (0x04000000 : UInt32) +public protected abbrev Cs : GC := (0x08000000 : UInt32) +public protected abbrev Co : GC := (0x10000000 : UInt32) +public protected abbrev Cn : GC := (0x20000000 : UInt32) +public protected abbrev C : GC := .Cc ||| .Cf ||| .Cs ||| .Co ||| .Cn def reprAux (x : GC) (extra := false) : List String := Id.run do let mut c := #[] @@ -440,7 +440,7 @@ public inductive CompatibilityTag | public compat deriving Inhabited, DecidableEq, Repr -instance : ToString CompatibilityTag where +public instance : ToString CompatibilityTag where toString | .font => "" | .noBreak => "" @@ -632,7 +632,7 @@ public def BidiClass.ofAbbrev! (abbr : String.Slice) : BidiClass := | some bc => bc | none => panic! "invalid bidi class abbreviation" -instance : Repr BidiClass where +public instance : Repr BidiClass where reprPrec bc _ := s!"Unicode.BidiClass.{bc.toAbbrev}" /-! diff --git a/makeTables.lean b/makeTables.lean index 00f745c69..dec724ea3 100644 --- a/makeTables.lean +++ b/makeTables.lean @@ -2,8 +2,9 @@ Copyright © 2024-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - +module import UnicodeData +public import UnicodeBasic open Unicode From 4d7e0bb73a5532cfe0ba3735d9bcd3f904704138 Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 06:34:11 -0400 Subject: [PATCH 08/10] fix: modulize executables --- makeCLib.lean | 2 +- makeTables.lean | 2 +- testTables.lean | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/makeCLib.lean b/makeCLib.lean index 79d5e611b..bcc9e42fa 100644 --- a/makeCLib.lean +++ b/makeCLib.lean @@ -228,7 +228,7 @@ static const unicode_data_t table[] = {" file.putStrLn s!"\{UINT32_C({c₀}),UINT32_C({c₁}),UINT64_C({d})}," file.putStrLn "};" -def main : IO UInt32 := do +public def main : IO UInt32 := do makeCaseC makePropC makeScriptC diff --git a/makeTables.lean b/makeTables.lean index dec724ea3..4beb0a710 100644 --- a/makeTables.lean +++ b/makeTables.lean @@ -527,7 +527,7 @@ def mkScriptName : Array (UInt32 × String) := (s.code, name.toString) t.qsort fun (a, _) (b, _) => a < b -def main (args : List String) : IO UInt32 := do +public def main (args : List String) : IO UInt32 := do let args := if args != [] then args else [ "Bidi_Class", "Bidi_Mirrored", diff --git a/testTables.lean b/testTables.lean index 155f19d79..7148800a4 100644 --- a/testTables.lean +++ b/testTables.lean @@ -2,7 +2,7 @@ Copyright © 2024-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - +module import UnicodeBasic import UnicodeData @@ -132,7 +132,7 @@ def tests : Array (String × (UnicodeData → Bool)) := #[ ("General_Category", testGeneralCategory), ("White_Space", testWhiteSpace)] -def main (args : List String) : IO UInt32 := do +public def main (args : List String) : IO UInt32 := do let args := if args.isEmpty then tests.map Prod.fst else args.toArray let stream : UnicodeDataStream := {} let mut err : UInt32 := 0 From 8b2477424de6c2df6b68195194c307cefc81a538 Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 07:18:19 -0400 Subject: [PATCH 09/10] chore: finish cleanup --- UnicodeBasic.lean | 2 -- lakefile.lean | 1 - makeCLib.lean | 1 + 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/UnicodeBasic.lean b/UnicodeBasic.lean index 415abfe03..f6b6d30e1 100644 --- a/UnicodeBasic.lean +++ b/UnicodeBasic.lean @@ -732,5 +732,3 @@ public def isAlphabetic (char : Char) : Bool := @[inherit_doc isAlphabetic] public abbrev isAlpha := isAlphabetic - -end Unicode diff --git a/lakefile.lean b/lakefile.lean index 9b2d0eb68..7ec02ad0d 100644 --- a/lakefile.lean +++ b/lakefile.lean @@ -7,7 +7,6 @@ import Lake open System Lake DSL package UnicodeBasic where - version := v!"1.1.0" description := "Basic Unicode support for Lean 4" keywords := #["unicode"] reservoir := true diff --git a/makeCLib.lean b/makeCLib.lean index bcc9e42fa..a3df4b5d0 100644 --- a/makeCLib.lean +++ b/makeCLib.lean @@ -1,3 +1,4 @@ +module import UnicodeData open Unicode From a60d32ace40f240326e826bbd5d3213838f4f9c6 Mon Sep 17 00:00:00 2001 From: "F. G. Dorais" Date: Thu, 21 May 2026 14:51:14 -0400 Subject: [PATCH 10/10] fix: public instances --- UnicodeBasic/Types.lean | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/UnicodeBasic/Types.lean b/UnicodeBasic/Types.lean index e154528d1..b0bda106c 100644 --- a/UnicodeBasic/Types.lean +++ b/UnicodeBasic/Types.lean @@ -279,10 +279,10 @@ public def toAbbrev! (x : GC) : String := open Std.Format Repr in public def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill) -instance : Repr GC where reprPrec +public instance : Repr GC where reprPrec public def toString (x : GC) := " | ".intercalate (reprAux x) -instance : ToString GC where toString +public instance : ToString GC where toString public def ofAbbrev? (s : String.Slice) : Option GC := match s.chars.take 3 |>.toList with