Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
290 changes: 72 additions & 218 deletions UnicodeBasic.lean

Large diffs are not rendered by default.

18 changes: 8 additions & 10 deletions UnicodeBasic/CharacterDatabase.lean
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ Released under Apache 2.0 license as described in the file LICENSE.
-/
module

public section

namespace Unicode

/-- Stream type for Unicode Character Database (UCD) files
Expand All @@ -15,31 +13,31 @@ namespace Unicode
around field values are not significant. Line comments are prefixed with a
number sign `#` (U+0023).
-/
structure UCDStream extends String.Slice where
public structure UCDStream extends String.Slice where
/-- `isUnihan` is true if the records are tab separated -/
isUnihan := false
deriving Inhabited

namespace UCDStream

/-- Make a `UCDStream` from a string slice -/
def ofStringSlice (str : String.Slice) : UCDStream := { str with }
public def ofStringSlice (str : String.Slice) : UCDStream := { str with }

/-- Make a `UCDStream` from a string -/
def ofString (str : String) : UCDStream := ofStringSlice str.toSlice
public def ofString (str : String) : UCDStream := ofStringSlice str.toSlice

/-- Make a `UCDStream` from a substring -/
def ofSubstring (str : Substring.Raw) : UCDStream := ofStringSlice str.toString.toSlice
public def ofSubstring (str : Substring.Raw) : UCDStream := ofStringSlice str.toString.toSlice

/-- Make a `UCDStream` from a file -/
def ofFile (path : System.FilePath) : IO UCDStream :=
public def ofFile (path : System.FilePath) : IO UCDStream :=
ofString <$> IO.FS.readFile path

/-- Get the next line from the `UCDStream`

Line comments are stripped and blank lines are skipped.
-/
protected partial def nextLine? (stream : UCDStream) : Option (String.Slice × UCDStream) := do
public protected partial def nextLine? (stream : UCDStream) : Option (String.Slice × UCDStream) := do
let line := stream.trimAsciiEnd.takeWhile (.!='\n')
if h : line.rawEndPos < stream.rawEndPos then
let nextPos := stream.posGT line.rawEndPos h
Expand All @@ -54,13 +52,13 @@ protected partial def nextLine? (stream : UCDStream) : Option (String.Slice × U

Spaces around field values are trimmed.
-/
protected def next? (stream : UCDStream) : Option (Array String.Slice × UCDStream) := do
public protected def next? (stream : UCDStream) : Option (Array String.Slice × UCDStream) := do
let sep := if stream.isUnihan then "\t" else ";"
let mut arr := #[]
let (line, table) ← stream.nextLine?
for item in line.split sep do
arr := arr.push item.trimAscii
return (arr, table)

instance : Std.Stream UCDStream (Array String.Slice) where
public instance : Std.Stream UCDStream (Array String.Slice) where
next? := UCDStream.next?
60 changes: 29 additions & 31 deletions UnicodeBasic/Hangul.lean
Original file line number Diff line number Diff line change
Expand Up @@ -4,67 +4,65 @@ Released under Apache 2.0 license as described in the file LICENSE.
-/
module

public section

namespace Unicode.Hangul

def shortNameL : Array String :=
public def shortNameL : Array String :=
#["G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
"SS", "", "J", "JJ", "C", "K", "T", "P", "H"]

def shortNameV : Array String :=
public def shortNameV : Array String :=
#["A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
"I"]

def shortNameT : Array String :=
public def shortNameT : Array String :=
#["", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
"LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
"SS", "NG", "J", "C", "K", "T", "P", "H"]

abbrev sizeL := shortNameL.size -- 19
abbrev sizeV := shortNameV.size -- 21
abbrev sizeT := shortNameT.size -- 28
abbrev sizeLV := sizeL * sizeV -- 399
abbrev sizeVT := sizeV * sizeT -- 588
abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172
public abbrev sizeL := shortNameL.size -- 19
public abbrev sizeV := shortNameV.size -- 21
public abbrev sizeT := shortNameT.size -- 28
public abbrev sizeLV := sizeL * sizeV -- 399
public abbrev sizeVT := sizeV * sizeT -- 588
public abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172

abbrev baseL : UInt32 := 0x1100
abbrev baseV : UInt32 := 0x1161
abbrev baseT : UInt32 := 0x11A7
public abbrev baseL : UInt32 := 0x1100
public abbrev baseV : UInt32 := 0x1161
public abbrev baseT : UInt32 := 0x11A7

/-- Number of Hangul syllables -/
def Syllable.size := sizeLVT
public def Syllable.size := sizeLVT

/-- Starting code point for Hangul syllables -/
def Syllable.base : UInt32 := 0xAC00
public def Syllable.base : UInt32 := 0xAC00

/-- Stopping code point for Hangul syllables -/
def Syllable.last : UInt32 := ⟨0xAC00 + Syllable.size - 1, by decide⟩
public def Syllable.last : UInt32 := ⟨0xAC00 + Syllable.size - 1, by decide⟩

/-- Hangul syllable type -/
structure Syllable where
public structure Syllable where
/-- L part of syllable -/
toL : Fin sizeL
public toL : Fin sizeL
/-- V part of syllable -/
toV : Fin sizeV
public toV : Fin sizeV
/-- T part of syllable (0 means none) -/
toT : Fin sizeT := ⟨0, by decide⟩
public toT : Fin sizeT := ⟨0, by decide⟩
deriving DecidableEq, Repr

instance : Inhabited Syllable where
public instance : Inhabited Syllable where
default := ⟨⟨0, by decide⟩, ⟨0, by decide⟩, ⟨0, by decide⟩⟩

/-- LV part of syllable -/
def Syllable.toLV : Syllable → Fin sizeLV
public def Syllable.toLV : Syllable → Fin sizeLV
| ⟨⟨l, hl⟩, ⟨v, hv⟩, _⟩ =>
have : l * sizeV + v < sizeLV := calc
_ < l * sizeV + sizeV := by apply Nat.add_lt_add_left; exact hv
_ = (l + 1) * sizeV := by rw [Nat.add_one_mul]
_ ≤ sizeL * sizeV := by apply Nat.mul_le_mul_right; exact hl
⟨l * sizeV + v, this⟩

def Syllable.index (s : Syllable) : Fin Syllable.size :=
public def Syllable.index (s : Syllable) : Fin Syllable.size :=
match s.toLV, s.toT with
| ⟨lv, hlv⟩, ⟨t, ht⟩ =>
have : lv * sizeT + t < sizeL * sizeV * sizeT := calc
Expand All @@ -75,27 +73,27 @@ def Syllable.index (s : Syllable) : Fin Syllable.size :=

/-- Get short name for Hangul syllable -/
@[inline]
def Syllable.getShortName (s : Syllable) : String := shortNameL[s.toL] ++ shortNameV[s.toV] ++ shortNameT[s.toT]
public def Syllable.getShortName (s : Syllable) : String := shortNameL[s.toL] ++ shortNameV[s.toV] ++ shortNameT[s.toT]

/-- Get L part character for syllable -/
def Syllable.getLChar (s : Syllable) : Char :=
public def Syllable.getLChar (s : Syllable) : Char :=
.ofNat (baseL.toNat + s.toL)

/-- Get V part character for syllable -/
def Syllable.getVChar (s : Syllable) : Char :=
public def Syllable.getVChar (s : Syllable) : Char :=
.ofNat (baseV.toNat + s.toV)

/-- Get LV part character for syllable -/
def Syllable.getLVChar (s : Syllable) : Char :=
public def Syllable.getLVChar (s : Syllable) : Char :=
.ofNat (base.toNat + sizeT * s.toLV)

/-- Get T part character for syllable -/
def Syllable.getTChar? (s : Syllable) : Option Char :=
public def Syllable.getTChar? (s : Syllable) : Option Char :=
if s.toT.val == 0 then none else
return .ofNat (baseT.toNat + s.toT)

/-- Get Hangul syllable from code point -/
def getSyllable? (code : UInt32) : Option Syllable :=
public def getSyllable? (code : UInt32) : Option Syllable :=
if code < 0xAC00 then none else
let index := (code - 0xAC00).toNat
if h : index ≥ Syllable.size then none else
Expand All @@ -111,7 +109,7 @@ def getSyllable? (code : UInt32) : Option Syllable :=
some ⟨⟨lpart, lislt⟩, ⟨vpart, vislt⟩, ⟨tpart, tislt⟩⟩

@[inherit_doc getSyllable?]
def getSyllable! (code : UInt32) : Syllable :=
public def getSyllable! (code : UInt32) : Syllable :=
match getSyllable? code with
| some s => s
| none => panic! "not a Hangul syllable"
Loading