Skip to content

Commit af4df72

Browse files
committed
Output invalid pinyin unchanged (the same as the input)
1 parent 26f22e0 commit af4df72

File tree

2 files changed

+22
-36
lines changed

2 files changed

+22
-36
lines changed

cppinyin/csrc/cppinyin.cc

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -232,13 +232,13 @@ void PinyinEncoder::Cut(const std::string &str,
232232
fail_bytes = 0;
233233
for (const auto &value : values_[std::get<2>(route[i])]) {
234234
auto value_t = value;
235+
auto it = tone_to_normal_.find(value);
236+
if (it == tone_to_normal_.end()) {
237+
ostrs->push_back(value_t);
238+
continue;
239+
}
235240
if (tone == "normal") {
236-
if (tone_to_normal_.find(value) != tone_to_normal_.end()) {
237-
value_t = tone_to_normal_.at(value);
238-
} else {
239-
std::cerr << "PinyinEncoder: " << value
240-
<< " is not in the NORMAL_TO_TONE map. " << std::endl;
241-
}
241+
value_t = it->second;
242242
}
243243
if (partial) {
244244
auto initial = GetInitial(value_t);
@@ -359,13 +359,9 @@ void PinyinEncoder::LoadVocab(std::istream &is) {
359359
std::vector<std::string> values;
360360
while (iss >> value) {
361361
// Always convert to number tone internally
362-
if (!std::isdigit(value.back())) {
363-
if (NORMAL_TO_TONE.find(value) == NORMAL_TO_TONE.end()) {
364-
std::cerr << "PinyinEncoder: " << value
365-
<< " is not in the NORMAL_TO_TONE map. " << std::endl;
366-
} else {
367-
value = NORMAL_TO_TONE.at(value);
368-
}
362+
auto it = NORMAL_TO_TONE.find(value);
363+
if (it != NORMAL_TO_TONE.end()) {
364+
value = it->second;
369365
}
370366
values.push_back(value);
371367
}
@@ -450,7 +446,8 @@ std::string PinyinEncoder::ToInitial(const std::string &s) const {
450446
return s;
451447
}
452448
if (tone_to_normal_.count(s) == 0 && NORMAL_TO_TONE.count(s) == 0) {
453-
std::cerr << "ToInitial: " << s << " is not a valid pinyin. " << std::endl;
449+
// std::cerr << "ToInitial: " << s << " is not a valid pinyin. " <<
450+
// std::endl;
454451
return std::string();
455452
}
456453
return GetInitial(s);
@@ -477,7 +474,7 @@ std::string PinyinEncoder::ToFinal(const std::string &s,
477474
} else if (NORMAL_TO_TONE.find(s) != NORMAL_TO_TONE.end()) {
478475
value = NORMAL_TO_TONE.at(s);
479476
} else {
480-
std::cerr << "ToFinal: " << s << " is not a valid pinyin. " << std::endl;
477+
// std::cerr << "ToFinal: " << s << " is not a valid pinyin. " << std::endl;
481478
return std::string();
482479
}
483480
if (tone == "none") {
@@ -551,13 +548,9 @@ size_t PinyinEncoder::LoadValues(std::istream &ifile) {
551548
for (uint32_t j = 0; j < sub_size; ++j) {
552549
offset += ReadString(ifile, &value);
553550
// Always convert to number tone internally
554-
if (!std::isdigit(value.back())) {
555-
if (NORMAL_TO_TONE.find(value) == NORMAL_TO_TONE.end()) {
556-
std::cerr << "PinyinEncoder: " << value
557-
<< " is not in the NORMAL_TO_TONE map. " << std::endl;
558-
} else {
559-
value = NORMAL_TO_TONE.at(value);
560-
}
551+
auto it = NORMAL_TO_TONE.find(value);
552+
if (it != NORMAL_TO_TONE.end()) {
553+
value = it->second;
561554
}
562555
values_[i][j] = value;
563556
}

cppinyin/python/cppinyin/cli.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,7 @@ def build(output: Path, dict_path: Path, user_dict_path: Path):
110110
"""
111111
output = Path(output)
112112
output.parent.mkdir(parents=True, exist_ok=True)
113-
114113
encoder = Encoder(get_dict_path(dict_path, user_dict_path))
115-
116114
encoder.save(str(output))
117115

118116

@@ -127,11 +125,10 @@ def build(output: Path, dict_path: Path, user_dict_path: Path):
127125
"--user-dict-path", type=Path, help="The path to user customized dict."
128126
)
129127
@click.option(
130-
"--no-tone",
131-
is_flag=True,
132-
show_default=True,
133-
default=False,
134-
help="Whether to include tones in output pinyins or not.",
128+
"--tone",
129+
type=click.Choice(["normal", "number", "none"]),
130+
default="number",
131+
help="Choose the tone style of pinyin",
135132
)
136133
@click.option(
137134
"--partial",
@@ -144,7 +141,7 @@ def encode(
144141
input: str,
145142
dict_path: Path,
146143
user_dict_path: Path,
147-
no_tone: bool,
144+
tone: str,
148145
partial: bool,
149146
):
150147
"""
@@ -161,14 +158,10 @@ def encode(
161158
with open(input, "r") as fi:
162159
for line in fi:
163160
pinyin = " ".join(
164-
encoder.encode(
165-
line.strip(), tone=not no_tone, partial=partial
166-
)
161+
encoder.encode(line.strip(), tone=tone, partial=partial)
167162
)
168163
click.echo(f"{line.strip()}\t{pinyin}")
169164
else:
170165
click.echo(
171-
" ".join(
172-
encoder.encode(input.strip(), tone=not no_tone, partial=partial)
173-
)
166+
" ".join(encoder.encode(input.strip(), tone=tone, partial=partial))
174167
)

0 commit comments

Comments
 (0)