|
| 1 | +/** |
| 2 | + * @Time : 2025/3/18 10:57 |
| 3 | + * @File : classify_text.go |
| 4 | + * @Software: dev_clip |
| 5 | + * @Author : Mr.Fang |
| 6 | + * @Description: 文本内容分类 |
| 7 | + */ |
| 8 | + |
| 9 | +package internal |
| 10 | + |
| 11 | +import ( |
| 12 | + "fmt" |
| 13 | + "regexp" |
| 14 | + "strings" |
| 15 | +) |
| 16 | + |
// Text categories returned by ClassifyText. The values are consecutive
// untyped integer constants starting at 0.
const (
	// CHINESE marks text consisting only of Han characters.
	CHINESE = iota
	// CAMEL_CASE marks a lower- or upper-camel-case identifier.
	CAMEL_CASE
	// SNAKE_CASE marks a lowercase, underscore-separated identifier.
	SNAKE_CASE
	// CODE_SNIPPET marks text that looks like source code.
	CODE_SNIPPET
	// ENGLISH marks English prose.
	ENGLISH
	// OTHER marks text that matched none of the categories above.
	OTHER
)
| 25 | + |
// hanOnlyRe matches a non-empty string made up solely of Han-script
// characters. It is compiled once at package initialisation instead of on
// every call.
var hanOnlyRe = regexp.MustCompile(`^[\p{Han}]+$`)

// isChinese reports whether text is non-empty and every character in it
// belongs to the Han script. Any other character (Latin letters, digits,
// whitespace, punctuation) makes it return false.
func isChinese(text string) bool {
	return hanOnlyRe.MatchString(text)
}
| 31 | + |
// Camel-case patterns, compiled once at package initialisation.
var (
	// lowerCamelRe: one or more lowercase letters followed by at least one
	// "hump" (an uppercase letter plus optional lowercase letters).
	lowerCamelRe = regexp.MustCompile(`^[a-z]+(?:[A-Z][a-z]*)+$`)
	// upperCamelRe: an uppercase letter, one or more lowercase letters, then
	// at least one further hump.
	upperCamelRe = regexp.MustCompile(`^[A-Z][a-z]+(?:[A-Z][a-z]*)+$`)
)

// IsCamelCase reports whether text is a lower-camel-case (e.g. "camelCase")
// or upper-camel-case (e.g. "CamelCase") identifier made of ASCII letters
// only. A single word without a second hump ("camel", "Camel") does not
// qualify, nor does anything containing digits or underscores.
func IsCamelCase(text string) bool {
	return lowerCamelRe.MatchString(text) || upperCamelRe.MatchString(text)
}
| 44 | + |
// snakeCaseRe matches two or more runs of lowercase ASCII letters joined by
// single underscores. It is compiled once at package initialisation.
var snakeCaseRe = regexp.MustCompile(`^[a-z]+(?:_[a-z]+)+$`)

// isSnakeCase reports whether text is a lowercase snake_case identifier:
// at least two lowercase words separated by single underscores, with no
// digits, uppercase letters, or leading/trailing/doubled underscores.
func isSnakeCase(text string) bool {
	return snakeCaseRe.MatchString(text)
}
| 50 | + |
| 51 | +// IsCodeSnippet 代码片段检测(通过特殊符号判断) |
| 52 | +func IsCodeSnippet(text string) bool { |
| 53 | + codeChars := []string{"{", "}", ";", "(", ")", "[", "]", "<", ">", "=", "\""} |
| 54 | + for _, char := range codeChars { |
| 55 | + if strings.Contains(text, char) { |
| 56 | + // [A-Za-z]+\( |
| 57 | + if char == "(" { |
| 58 | + re := regexp.MustCompile(`[A-Za-z]+\(+`) |
| 59 | + return re.MatchString(text) |
| 60 | + } |
| 61 | + return true |
| 62 | + } |
| 63 | + } |
| 64 | + // 第二种情况;空格拆分只有一个长度,并且中间有.连接 |
| 65 | + if len(strings.Split(text, " ")) == 1 && strings.Contains(text, ".") { |
| 66 | + _, b := ExtractURLs(text) |
| 67 | + if b { |
| 68 | + return false |
| 69 | + } |
| 70 | + return true |
| 71 | + } |
| 72 | + return false |
| 73 | +} |
| 74 | + |
| 75 | +// IsEnglish 纯英文检测(排除驼峰命名、下划线命名、代码片段) |
| 76 | +func IsEnglish(text string) bool { |
| 77 | + if IsCamelCase(text) || isSnakeCase(text) || IsCodeSnippet(text) { |
| 78 | + return false |
| 79 | + } |
| 80 | + re := regexp.MustCompile(`^[a-zA-Z\s.,!?'’]+$`) |
| 81 | + if re.MatchString(text) { |
| 82 | + return true |
| 83 | + } |
| 84 | + return IsEnglishContent(text) |
| 85 | +} |
| 86 | + |
// englishTokenRe matches a non-empty token consisting only of ASCII letters,
// whitespace and the punctuation . , ! ? ' ’. It is compiled once at package
// initialisation.
var englishTokenRe = regexp.MustCompile(`^[a-zA-Z\s.,!?'’]+$`)

// IsEnglishContent reports whether strictly more than 60% of the
// space-separated tokens in text are English words.
//
// text is split on single space characters. A token counts as English when
// it matches englishTokenRe; empty tokens (produced by consecutive spaces or
// an empty text) never match and therefore count against the ratio.
func IsEnglishContent(text string) bool {
	tokens := strings.Split(text, " ")
	english := 0
	for _, tok := range tokens {
		if englishTokenRe.MatchString(tok) {
			english++
		}
	}
	// Compare with exact integer arithmetic. Truncating the percentage to an
	// int before testing "> 60" would wrongly reject ratios between 60% and
	// 61% (e.g. 14 of 23 tokens = 60.87%).
	return english*100 > len(tokens)*60
}
| 101 | + |
// CalcPercentage returns numerator divided by denominator, expressed as a
// percentage and truncated to an int (the fractional part is discarded).
// When denominator is zero it returns 0 and a non-nil error.
func CalcPercentage(numerator, denominator float64) (int, error) {
	if denominator != 0 {
		ratio := numerator / denominator
		return int(ratio * 100), nil
	}
	return 0, fmt.Errorf("除数为 0 ")
}
| 110 | + |
// urlRe matches "http://" or "https://" followed by one or more
// non-whitespace characters. It is compiled once at package initialisation.
var urlRe = regexp.MustCompile(`https?://[^\s]+`)

// ExtractURLs returns every http/https URL found in text, in order of
// appearance, together with a flag reporting whether text as a whole is
// exactly one URL (the first match is as long as text itself). When text
// contains no URL the slice is nil and the flag is false.
func ExtractURLs(text string) ([]string, bool) {
	urls := urlRe.FindAllString(text, -1)
	wholeTextIsURL := len(urls) > 0 && len(urls[0]) == len(text)
	return urls, wholeTextIsURL
}
| 117 | + |
| 118 | +// ClassifyText 分类文本 |
| 119 | +func ClassifyText(text string) int { |
| 120 | + if isChinese(text) { |
| 121 | + fmt.Println("纯中文内容") |
| 122 | + return CHINESE |
| 123 | + } else if IsCamelCase(text) { |
| 124 | + fmt.Println("驼峰命名") |
| 125 | + return CAMEL_CASE |
| 126 | + } else if isSnakeCase(text) { |
| 127 | + fmt.Println("下划线命名") |
| 128 | + return SNAKE_CASE |
| 129 | + } else if IsCodeSnippet(text) { |
| 130 | + fmt.Println("代码片段") |
| 131 | + return CODE_SNIPPET |
| 132 | + } else if IsEnglish(text) { |
| 133 | + fmt.Println("纯英文内容") |
| 134 | + return ENGLISH |
| 135 | + } else { |
| 136 | + fmt.Println("无法分类的内容") |
| 137 | + return OTHER |
| 138 | + } |
| 139 | +} |
0 commit comments