From 3cb6a86b140f622229ba624cad17e80bf2e64883 Mon Sep 17 00:00:00 2001 From: obadx Date: Mon, 16 Feb 2026 14:55:53 +0200 Subject: [PATCH 1/6] =?UTF-8?q?=D8=AE=D8=B7=D8=A3:=20=D8=A5=D8=B5=D9=84?= =?UTF-8?q?=D8=A7=D8=AD=20=D8=A7=D9=84=D8=AA=D9=86=D8=A7=D8=B8=D8=B1=20?= =?UTF-8?q?=D9=81=D9=8A=20=D8=A7=D9=84=D9=82=D9=84=D9=82=D8=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- .../phonetics/conv_base_operation.py | 42 +++++++++- .../phonetics/error_explainer.py | 76 ++++++------------- src/quran_transcript/phonetics/operations.py | 3 +- .../phonetics/tajweed_rulses.py | 3 + tests/test_explain_error_api.py | 5 +- tests/test_sub_with_mapping.py | 5 +- tests/test_sub_with_mapping_pytest.py | 55 ++++++++++++++ uv.lock | 2 +- 9 files changed, 135 insertions(+), 58 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1571a55..d600596 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] license = "MIT" name = "quran-transcript" -version = "0.4.0" +version = "0.4.1" authors = [ { name="Abdullah", email="abdullahamlyossef@gmail.com" }, ] diff --git a/src/quran_transcript/phonetics/conv_base_operation.py b/src/quran_transcript/phonetics/conv_base_operation.py index 2c5f719..f1ceae6 100644 --- a/src/quran_transcript/phonetics/conv_base_operation.py +++ b/src/quran_transcript/phonetics/conv_base_operation.py @@ -441,6 +441,35 @@ def get_mappings( new_mappings = merge_mappings(mappings, new_mappings) + # Special case where skoon sign is repaced with qalalah sign + # We want the qalqlah sign associated with the letter it self not the + # Did not want that but no way to solve exept with this + for re_out in re.finditer( + f"[^{alph.uthmani.ras_haaa}{alph.uthmani.shadda}]({alph.phonetics.qlqla})", + new_text, + ): + qlq_idx = re_out.span(1)[0] + char_idx = qlq_idx - 1 + # getting skon or shadda idx in the merged mappings + m_idx = 0 + for m_idx in range(len(new_mappings)): + if new_mappings[m_idx].pos[0] == qlq_idx: + break + # Avodig the case where we have qalqlah at the end with no (shadda or skonJ) + if new_mappings[m_idx - 1].tajweed_rules is None: + print("here") + new_mappings[m_idx - 1].pos = ( + new_mappings[m_idx - 1].pos[0], + new_mappings[m_idx].pos[1], + ) + new_mappings[m_idx - 1].tajweed_rules = new_mappings[m_idx].tajweed_rules + new_mappings[m_idx].pos = ( + new_mappings[m_idx].pos[1], + new_mappings[m_idx].pos[1], + ) + new_mappings[m_idx].deleted = True + new_mappings[m_idx].tajweed_rules = None + # TODO: remove this curr_m = None next_m = None @@ -579,6 +608,7 @@ def sub_with_mapping( class ConversionOperation: regs: list[tuple[str, str]] | tuple[str, str] arabic_name: str + tajweed_rules: list[TajweedRule | None] | TajweedRule | None = None ops_before: list["ConversionOperation"] | None = None def __post_init__(self): @@ -588,14 +618,22 @@ def __post_init__(self): if self.ops_before is None: self.ops_before = [] + if self.tajweed_rules: + if not isinstance(self.tajweed_rules, list): + self.tajweed_rules = [self.tajweed_rules] + else: + self.tajweed_rules = [None for _ in range(len(self.regs))] + def forward( self, text, moshaf: MoshafAttributes, mappings: MappingListType | None = None, ) -> tuple[str, MappingListType]: - for input_reg, out_reg in self.regs: - text, mappings = sub_with_mapping(input_reg, out_reg, text, mappings) + for (input_reg, out_reg), taj_rule in zip(self.regs, self.tajweed_rules): + text, mappings = sub_with_mapping( + input_reg, out_reg, text, mappings, tajweed_rule=taj_rule + ) return text, mappings def apply( diff --git a/src/quran_transcript/phonetics/error_explainer.py b/src/quran_transcript/phonetics/error_explainer.py index dde5b1d..ee8f0fc 100644 --- a/src/quran_transcript/phonetics/error_explainer.py +++ b/src/quran_transcript/phonetics/error_explainer.py @@ -9,7 +9,7 @@ from .tajweed_rulses import TajweedRule, NormalMaddRule -from .conv_base_operation import MappingPos +from .conv_base_operation import MappingPos, MappingListType @dataclass @@ -77,46 +77,47 @@ def align_phonemes_groups( def extract_ref_phonetic_to_uthmani( - mappings: list[MappingPos | None], + mappings: MappingListType, ) -> dict[int, int]: ref_ph_to_uthmani = {} for idx, map_pos in enumerate(mappings): - if map_pos is not None: - for ph_idx in range(*map_pos.pos): - if ph_idx in ref_ph_to_uthmani: - raise ValueError( - f"Same phonetic scripts has multiple uthmani script. Phonetic posision: `{ph_idx}`, Uthmani Poses: `{ref_ph_to_uthmani[ph_idx]}, {idx}`" - ) - else: - ref_ph_to_uthmani[ph_idx] = idx + for ph_idx in range(*map_pos.pos): + if ph_idx in ref_ph_to_uthmani: + raise ValueError( + f"Same phonetic scripts has multiple uthmani script. Phonetic posision: `{ph_idx}`, Uthmani Poses: `{ref_ph_to_uthmani[ph_idx]}, {idx}`" + ) + else: + ref_ph_to_uthmani[ph_idx] = idx return ref_ph_to_uthmani def get_ref_phonetic_groups_tajweed_rules( ref_ph_groups: list[str], - mappings: list[MappingPos | None], + mappings: MappingListType, ref_ph_to_uthmani: dict[int, int], -) -> list[None | list[TajweedRule]]: - ref_tajweed_rules = [None] * len(ref_ph_groups) +) -> list[list[TajweedRule]]: + ref_tajweed_rules: list[TajweedRule] = [[] for _ in range(len(ref_ph_groups))] start = 0 end = 0 - # Computing Tajweed rules - # TODO: O(n^2) too bad should be O(n) + # Computing Tajweed rules O(len(ref_ph_text)) for ph_g_idx, ph_g in enumerate(ref_ph_groups): end += len(ph_g) - for map_pos in mappings: - if map_pos is not None: - if start >= map_pos.pos[0] and end <= map_pos.pos[1]: - if ref_tajweed_rules[ph_g_idx] is None: - ref_tajweed_rules[ph_g_idx] = map_pos.tajweed_rules - else: - ref_tajweed_rules[ph_g_idx] += map_pos.tajweed_rules + used_uth_ids = set() + for ph_idx in range(start, end): + uth_idx = ref_ph_to_uthmani[ph_idx] + if uth_idx not in used_uth_ids: + used_uth_ids.add(uth_idx) + if mappings[uth_idx].tajweed_rules: + ref_tajweed_rules[ph_g_idx].extend(mappings[uth_idx].tajweed_rules) start = end return ref_tajweed_rules def explain_error( - uthmani_text, ref_ph_text, predicted_ph_text, mappings: list[MappingPos | None] + uthmani_text: str, + ref_ph_text: str, + predicted_ph_text: str, + mappings: MappingListType, ) -> list[ReciterError]: """ """ ref_ph_groups = chunck_phonemes(ref_ph_text) @@ -171,7 +172,7 @@ def explain_error( elif align.op_type == "replace": pred_rules = [] - if ref_ph_groups_tajweed_rules[align.ref_idx] is not None: + if ref_ph_groups_tajweed_rules[align.ref_idx]: for taj_rule in ref_ph_groups_tajweed_rules[align.ref_idx]: pred_taj_rule = taj_rule.get_relvant_rule(pred_ph) if pred_taj_rule is not None: @@ -283,30 +284,3 @@ def explain_error( ref_ph_start = ref_ph_end return errors - - -if __name__ == "__main__": - uthmani_text = "قالوا" - ph_text = "قاالۥۥ" - predicted_text = "كالۥۥ" - predicted_text = "فكالۥۥ" - predicted_text = "فكۥۥلۥۥ" - - normal_madd_alif = NormalMaddRule(tag="alif") - normal_madd_waw = NormalMaddRule(tag="waw") - - mapping = [ - MappingPos(pos=(0, 1)), - MappingPos(pos=(1, 3), tajweed_rules=[normal_madd_alif]), - MappingPos(pos=(3, 4)), - MappingPos(pos=(4, 6), tajweed_rules=[normal_madd_waw]), - None, - ] - errors = explain_error( - uthmani_text=uthmani_text, - ref_ph_text=ph_text, - predicted_ph_text=predicted_text, - mappings=mapping, - ) - for err in errors: - print(err) diff --git a/src/quran_transcript/phonetics/operations.py b/src/quran_transcript/phonetics/operations.py index fbf5d57..7a56836 100644 --- a/src/quran_transcript/phonetics/operations.py +++ b/src/quran_transcript/phonetics/operations.py @@ -12,7 +12,7 @@ from .moshaf_attributes import MoshafAttributes from ..alphabet import uthmani as uth from ..alphabet import phonetics as ph -from .tajweed_rulses import NormalMaddRule +from .tajweed_rulses import NormalMaddRule, Qalqalah, TajweedRule @dataclass @@ -868,6 +868,7 @@ class Qalqla(ConversionOperation): f"([{uth.qlqla_group}](?:{uth.shadda}$|{uth.ras_haaa}|$))", r"\1" + ph.qlqla, ) + tajweed_rules: TajweedRule = field(default_factory=lambda: Qalqalah()) ops_before: list[ConversionOperation] = field( default_factory=lambda: [ CleanEnd(), diff --git a/src/quran_transcript/phonetics/tajweed_rulses.py b/src/quran_transcript/phonetics/tajweed_rulses.py index 4b36822..450f2fb 100644 --- a/src/quran_transcript/phonetics/tajweed_rulses.py +++ b/src/quran_transcript/phonetics/tajweed_rulses.py @@ -54,6 +54,9 @@ class Qalqalah(TajweedRule): golden_len: int = 0 correctness_type: Literal["match", "count"] = "match" + def match(self, ref_text, pred_text) -> bool: + return ref_text == pred_text + def is_ph_str_in(self, ph_str: str) -> bool: """Whether the phonetic script is assoicated with this Tajweed rule or not""" return True diff --git a/tests/test_explain_error_api.py b/tests/test_explain_error_api.py index 1992555..7caa1f7 100644 --- a/tests/test_explain_error_api.py +++ b/tests/test_explain_error_api.py @@ -17,8 +17,10 @@ predicted_text = "كالۥۥ" predicted_text = "فكالۥۥ" - predicted_text = "فكۥۥلۥۥ" + # predicted_text = "فكۥۥلۥۥ" + print(ref_ph_out.phonemes) + print(predicted_text) errors = explain_error( uthmani_text=uthmani_text, ref_ph_text=ref_ph_out.phonemes, @@ -27,3 +29,4 @@ ) for err in errors: print(err) + print("-" * 50) diff --git a/tests/test_sub_with_mapping.py b/tests/test_sub_with_mapping.py index 5b35970..9d40b91 100644 --- a/tests/test_sub_with_mapping.py +++ b/tests/test_sub_with_mapping.py @@ -20,6 +20,7 @@ aya = Aya(2, 6) aya = Aya(2, 7) aya = Aya(27, 62) + aya = Aya(112, 3) # aya = Aya(3, 1) # aya = Aya(30, 28) # aya = Aya(2, 9) @@ -30,10 +31,12 @@ # uth_text = "غِشَـٰوَةٌۭ وَلَهُمْ" # uth_text = "قَلِيلًۭا مِّمَّا" # uth_text = "أَمَّن يُجِيبُ" + # uth_text = "قَرِيبٌ" + uth_text = "ٱلْحَقُّ" profiler = Profiler() profiler.start() - ph_out = quran_phonetizer(uth_text, moshaf, remove_spaces=True) + ph_out = quran_phonetizer(uth_text, moshaf) profiler.stop() ph_text = ph_out.phonemes print(uth_text) diff --git a/tests/test_sub_with_mapping_pytest.py b/tests/test_sub_with_mapping_pytest.py index 1a9e903..2cda8b1 100644 --- a/tests/test_sub_with_mapping_pytest.py +++ b/tests/test_sub_with_mapping_pytest.py @@ -994,6 +994,61 @@ def test_merge_mappings_complex_range(self): ), ], ), + ( + "لَمْ يَلِدْ وَلَمْ يُولَدْ", + "لَم يَلِدڇ وَلَم يُۥۥلَدڇ", + [ + MappingPos(pos=(0, 1), tajweed_rules=None, deleted=False), + MappingPos(pos=(1, 2), tajweed_rules=None, deleted=False), + MappingPos(pos=(2, 3), tajweed_rules=None, deleted=False), + MappingPos(pos=(3, 3), tajweed_rules=None, deleted=True), + MappingPos(pos=(3, 4), tajweed_rules=None, deleted=False), + MappingPos(pos=(4, 5), tajweed_rules=None, deleted=False), + MappingPos(pos=(5, 6), tajweed_rules=None, deleted=False), + MappingPos(pos=(6, 7), tajweed_rules=None, deleted=False), + MappingPos(pos=(7, 8), tajweed_rules=None, deleted=False), + MappingPos(pos=(8, 10), tajweed_rules=[Qalqalah()]), + MappingPos(pos=(10, 10), tajweed_rules=None, deleted=True), + MappingPos(pos=(10, 11), tajweed_rules=None, deleted=False), + MappingPos(pos=(11, 12), tajweed_rules=None, deleted=False), + MappingPos(pos=(12, 13), tajweed_rules=None, deleted=False), + MappingPos(pos=(13, 14), tajweed_rules=None, deleted=False), + MappingPos(pos=(14, 15), tajweed_rules=None, deleted=False), + MappingPos(pos=(15, 16), tajweed_rules=None, deleted=False), + MappingPos(pos=(16, 16), tajweed_rules=None, deleted=True), + MappingPos(pos=(16, 17), tajweed_rules=None, deleted=False), + MappingPos(pos=(17, 18), tajweed_rules=None, deleted=False), + MappingPos(pos=(18, 19), tajweed_rules=None, deleted=False), + MappingPos( + pos=(19, 21), + tajweed_rules=[ + NormalMaddRule( + tag="waw", + ) + ], + ), + MappingPos(pos=(21, 22), tajweed_rules=None, deleted=False), + MappingPos(pos=(22, 23), tajweed_rules=None, deleted=False), + MappingPos( + pos=(23, 25), + tajweed_rules=[Qalqalah()], + ), + MappingPos(pos=(25, 25), tajweed_rules=None, deleted=True), + ], + ), + ( + "قَرِيبٌ", + "قَرِۦۦۦۦبڇ", + [ + MappingPos(pos=(0, 1), tajweed_rules=None, deleted=False), + MappingPos(pos=(1, 2), tajweed_rules=None, deleted=False), + MappingPos(pos=(2, 3), tajweed_rules=None, deleted=False), + MappingPos(pos=(3, 4), tajweed_rules=None, deleted=False), + MappingPos(pos=(4, 8), tajweed_rules=None, deleted=False), + MappingPos(pos=(8, 10), tajweed_rules=[Qalqalah()]), + MappingPos(pos=(10, 10), tajweed_rules=None, deleted=True), + ], + ), ], ) def test_phonetizer_with_mappings( diff --git a/uv.lock b/uv.lock index 67decf0..b6c8696 100644 --- a/uv.lock +++ b/uv.lock @@ -557,7 +557,7 @@ wheels = [ [[package]] name = "quran-transcript" -version = "0.4.0" +version = "0.4.1" source = { editable = "." } dependencies = [ { name = "fuzzysearch" }, From 63a37a3a2927b495934c2ba70bc293be455da1b7 Mon Sep 17 00:00:00 2001 From: obadx Date: Mon, 16 Feb 2026 17:15:31 +0200 Subject: [PATCH 2/6] add: Support for Tashkeel Errors --- .../phonetics/error_explainer.py | 58 ++++++++++++++----- tests/test_explain_error_api.py | 18 ++++-- 2 files changed, 56 insertions(+), 20 deletions(-) diff --git a/src/quran_transcript/phonetics/error_explainer.py b/src/quran_transcript/phonetics/error_explainer.py index ee8f0fc..15c2751 100644 --- a/src/quran_transcript/phonetics/error_explainer.py +++ b/src/quran_transcript/phonetics/error_explainer.py @@ -22,8 +22,10 @@ class ReciterError: preditected_ph: str expected_len: Optional[int] | None = None predicted_len: Optional[int] | None = None - tajweed_rules: Optional[list[TajweedRule]] | None = None - predicted_tajweed_rules: Optional[list[TajweedRule]] | None = None + ref_tajweed_rules: Optional[list[TajweedRule]] | None = None + inserted_tajweed_rules: Optional[list[TajweedRule]] | None = None + replaced_tajweed_rules: Optional[list[TajweedRule]] | None = None + missing_tajweed_rules: Optional[list[TajweedRule]] | None = None @dataclass @@ -131,16 +133,22 @@ def explain_error( ) # Aligning Phonemes groups using first chat of every one - alignmets = align_phonemes_groups(ref_ph_groups, pred_ph_groups) + alignments = align_phonemes_groups(ref_ph_groups, pred_ph_groups) errors = [] pred_ph_start = 0 ref_ph_start = 0 pred_ph_end = 0 ref_ph_end = 0 - for align in alignmets: - ref_ph = ref_ph_groups[align.ref_idx] - pred_ph = pred_ph_groups[align.pred_idx] + for align in alignments: + if align.ref_idx < len(ref_ph_groups): + ref_ph = ref_ph_groups[align.ref_idx] + else: + ref_ph = "" + if align.pred_idx < len(pred_ph_groups): + pred_ph = pred_ph_groups[align.pred_idx] + else: + pred_ph = "" pred_ph_end = pred_ph_start + len(pred_ph) if align.op_type != "insert": @@ -151,6 +159,7 @@ def explain_error( ) ph_pos = (ref_ph_start, ref_ph_end) else: + # TODO: Make the uthmani posision more precise. Now we bound it to the aligments uthmani_pos = ( ref_ph_to_uthmani[ref_ph_start], ref_ph_to_uthmani[ref_ph_start], @@ -193,8 +202,8 @@ def explain_error( preditected_ph=pred_ph, expected_len=ref_len, predicted_len=pred_len, - tajweed_rules=[taj_rule], - predicted_tajweed_rules=[pred_taj_rule], + ref_tajweed_rules=[taj_rule], + replaced_tajweed_rules=[pred_taj_rule], ) ) else: @@ -206,7 +215,7 @@ def explain_error( speech_error_type="replace", expected_ph=ref_ph, preditected_ph=pred_ph, - tajweed_rules=[taj_rule], + ref_tajweed_rules=[taj_rule], ) ) @@ -240,16 +249,17 @@ def explain_error( if ref_ph == pred_ph: ... # We have Tajweed rule - elif ref_ph_groups_tajweed_rules[align.ref_idx] is not None: + elif ref_ph_groups_tajweed_rules[align.ref_idx]: for taj_rule in ref_ph_groups_tajweed_rules[align.ref_idx]: exp_len = None pred_len = None + missing_taj_rules = None if taj_rule.correctness_type == "count": pred_len = taj_rule.count(ref_ph, pred_ph) exp_len = taj_rule.golden_len - # TODO: What to do with `match` elif taj_rule.correctness_type == "match": - ... + if not taj_rule.match(ref_ph, pred_ph): + missing_taj_rules = [taj_rule] else: raise ValueError( f"Invalid mathing type: `{taj_rule.correctness_type}`. Available: `match`, `count`" @@ -264,14 +274,30 @@ def explain_error( preditected_ph=pred_ph, expected_len=exp_len, predicted_len=pred_len, - tajweed_rules=[taj_rule], + ref_tajweed_rules=[taj_rule], + missing_tajweed_rules=missing_taj_rules, ) ) # Tashkeel (Harakat) - # TODO: - elif ref_ph_groups[align.ref_idx][-1] in alph.phonetic_groups.harakat: - ... + elif ref_ph[-1] in alph.phonetic_groups.harakat: + if len(pred_ph) > len(ref_ph): + sp_tp = "insert" + elif len(pred_ph) < len(ref_ph): + sp_tp = "delete" + else: + sp_tp = "replace" + + errors.append( + ReciterError( + uthmani_pos=uthmani_pos, + ph_pos=ph_pos, + error_type="tashkeel", + speech_error_type=sp_tp, + expected_ph=ref_ph, + preditected_ph=pred_ph, + ) + ) # TODO: imala, sakt, dammao momala elif ref_ph_groups[align.ref_idx][-1] in alph.phonetic_groups.residuals: diff --git a/tests/test_explain_error_api.py b/tests/test_explain_error_api.py index 7caa1f7..a75dcf6 100644 --- a/tests/test_explain_error_api.py +++ b/tests/test_explain_error_api.py @@ -4,8 +4,6 @@ if __name__ == "__main__": - uthmani_text = "قَالُوٓا۟" - moshaf = MoshafAttributes( rewaya="hafs", madd_monfasel_len=4, @@ -13,12 +11,18 @@ madd_mottasel_waqf=4, madd_aared_len=4, ) - ref_ph_out = quran_phonetizer(uthmani_text, moshaf) + uthmani_text = "قَالُوٓا۟" predicted_text = "كالۥۥ" predicted_text = "فكالۥۥ" - # predicted_text = "فكۥۥلۥۥ" + predicted_text = "فكۥۥلۥۥ" + + uthmani_text = "ٱلْحَقُّ" + predicted_text = "ءَلحَقق" + predicted_text = "ءَلحقق" + predicted_text = "ءَلحُقق" + ref_ph_out = quran_phonetizer(uthmani_text, moshaf) print(ref_ph_out.phonemes) print(predicted_text) errors = explain_error( @@ -28,5 +32,11 @@ mappings=ref_ph_out.mappings, ) for err in errors: + print( + f"UTH: `{uthmani_text[err.uthmani_pos[0] : err.uthmani_pos[1]]}`, {err.uthmani_pos}" + ) + print( + f"PH: `{ref_ph_out.phonemes[err.ph_pos[0] : err.ph_pos[1]]}`, {err.ph_pos}" + ) print(err) print("-" * 50) From 9263c32dcbf354c47d597cd1a7b4953e08d0520c Mon Sep 17 00:00:00 2001 From: obadx Date: Mon, 16 Feb 2026 20:00:56 +0200 Subject: [PATCH 3/6] =?UTF-8?q?=D8=A5=D8=B6=D8=A7=D9=81=D8=A9=20=D8=A7?= =?UTF-8?q?=D9=84=D9=85=D8=AF=D9=88=D8=AF=20=D9=81=D9=8A=20=D8=B4=D8=B1?= =?UTF-8?q?=D8=AD=20=D8=A7=D9=84=D8=A3=D8=AE=D8=B7=D8=A7=D8=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- .../phonetics/conv_base_operation.py | 15 ++++ src/quran_transcript/phonetics/operations.py | 30 +++++++- .../phonetics/tajweed_rulses.py | 74 ++++++++++++++++++- tests/test_sub_with_mapping.py | 12 +-- tests/test_sub_with_mapping_pytest.py | 35 ++++++++- 6 files changed, 158 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d600596..24ced46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] license = "MIT" name = "quran-transcript" -version = "0.4.1" +version = "0.5.0" authors = [ { name="Abdullah", email="abdullahamlyossef@gmail.com" }, ] diff --git a/src/quran_transcript/phonetics/conv_base_operation.py b/src/quran_transcript/phonetics/conv_base_operation.py index f1ceae6..e6025ef 100644 --- a/src/quran_transcript/phonetics/conv_base_operation.py +++ b/src/quran_transcript/phonetics/conv_base_operation.py @@ -406,6 +406,21 @@ def get_mappings( # TODO: remove this assert all(m is not None for m in new_mappings) + # Special Case where we want to assgin the tag for Leen Madd + for m_idx in range(len(new_mappings)): + if new_mappings[m_idx].tajweed_rules: + for taj_idx in range(len(new_mappings[m_idx].tajweed_rules)): + if ( + new_mappings[m_idx].tajweed_rules[taj_idx].name.en == "Leen Madd" + and new_mappings[m_idx].tajweed_rules[taj_idx].tag is None + ): + tag = ( + new_mappings[m_idx] + .tajweed_rules[taj_idx] + ._madd_to_tag[new_text[new_mappings[m_idx].pos[0]]] + ) + new_mappings[m_idx].tajweed_rules[taj_idx].tag = tag + # Special case where we have Idgham tanween # Special sympol `tanweed_idgham_detrminer` has no meaning moving it to the tanween for re_out in re.finditer(f"{alph.uthmani.tanween_idhaam_dterminer}[^$]", text): diff --git a/src/quran_transcript/phonetics/operations.py b/src/quran_transcript/phonetics/operations.py index 7a56836..1ca3afe 100644 --- a/src/quran_transcript/phonetics/operations.py +++ b/src/quran_transcript/phonetics/operations.py @@ -12,7 +12,17 @@ from .moshaf_attributes import MoshafAttributes from ..alphabet import uthmani as uth from ..alphabet import phonetics as ph -from .tajweed_rulses import NormalMaddRule, Qalqalah, TajweedRule +from .tajweed_rulses import ( + TajweedRule, + Qalqalah, + NormalMaddRule, + MonfaselMaddRule, + MottaselMaddPauseRule, + MottaselMaddRule, + LazemMaddRule, + AaredMaddRule, + LeenMaddRule, +) @dataclass @@ -769,6 +779,7 @@ def forward( r"\1" + ph.alif * moshaf.madd_monfasel_len + r"\2", text, mappings, + MonfaselMaddRule(golden_len=moshaf.madd_monfasel_len, tag="alif"), ) # normal for k, madd_patt in self.madd_map.items(): @@ -777,6 +788,9 @@ def forward( r"\1" + moshaf.madd_monfasel_len * madd_patt.target + r"\2", text, mappings, + MonfaselMaddRule( + golden_len=moshaf.madd_monfasel_len, tag=madd_patt.name + ), ) # المد المتصل وقفا @@ -790,6 +804,10 @@ def forward( + r"\2", text, mappings, + MottaselMaddPauseRule( + golden_len=max(moshaf.madd_mottasel_waqf, moshaf.madd_aared_len), + tag=madd_patt.name, + ), ) # المد المنفصل @@ -799,6 +817,9 @@ def forward( r"\1" + moshaf.madd_mottasel_len * madd_patt.target + r"\2", text, mappings, + MottaselMaddRule( + golden_len=moshaf.madd_mottasel_len, tag=madd_patt.name + ), ) # المد اللازم @@ -808,10 +829,13 @@ def forward( r"\1" + (moshaf.madd_yaa_alayn_alharfy - 1) * ph.yaa, text, mappings, + LeenMaddRule(golden_len=moshaf.madd_yaa_alayn_alharfy, tag="yaa"), ) # ميم آل عمران + meem_aal_imran_taj_rule = LazemMaddRule(tag="alif") if moshaf.meem_aal_imran == "wasl_2": meema_len = 2 + meem_aal_imran_taj_rule = NormalMaddRule(tag="alif") elif moshaf.meem_aal_imran == "wasl_6": meema_len = 6 else: @@ -821,6 +845,7 @@ def forward( r"\1" + ph.yaa_madd * meema_len + r"\2", text, mappings, + meem_aal_imran_taj_rule, ) for k, madd_patt in self.madd_map.items(): @@ -829,6 +854,7 @@ def forward( r"\1" + 6 * madd_patt.target + r"\2", text, mappings, + LazemMaddRule(tag=madd_patt.name), ) # المد العارض للسكون @@ -838,6 +864,7 @@ def forward( r"\1" + moshaf.madd_aared_len * madd_patt.target + r"\2", text, mappings, + AaredMaddRule(golden_len=moshaf.madd_aared_len, tag=madd_patt.name), ) # مد اللين @@ -846,6 +873,7 @@ def forward( r"\1" + (moshaf.madd_alleen_len - 1) * r"\2" + r"\3", text, mappings, + LeenMaddRule(golden_len=moshaf.madd_alleen_len), ) # المد الطبيعي diff --git a/src/quran_transcript/phonetics/tajweed_rulses.py b/src/quran_transcript/phonetics/tajweed_rulses.py index 450f2fb..e48a1f8 100644 --- a/src/quran_transcript/phonetics/tajweed_rulses.py +++ b/src/quran_transcript/phonetics/tajweed_rulses.py @@ -63,7 +63,12 @@ def is_ph_str_in(self, ph_str: str) -> bool: def get_relvant_rule(self, ph_str: str) -> Optional["TajweedRule"]: """Returs a Tajweed rule that is assocaited with the input ph_str""" - return self + if not ph_str: + return None + elif ph_str[-1] == alph.phonetics.qlqla: + return self + else: + return None @dataclass @@ -98,7 +103,7 @@ def is_ph_str_in(self, ph_str: str) -> bool: def get_relvant_rule(self, ph_str: str) -> Optional["TajweedRule"]: """Returs a Tajweed rule that is assocaited with the input ph_str""" if not ph_str: - raise ValueError("Empty String") + return None elif ph_str[0] not in self._madd_to_tag: return None return replace(self, tag=self._madd_to_tag[ph_str[0]]) @@ -112,6 +117,71 @@ class NormalMaddRule(MaddRule): golden_len: int = 2 +@dataclass +class MonfaselMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="المد المنفصل", en="Monfasel Madd") + ) + golden_len: int = 4 + + +@dataclass +class MottaselMaddPauseRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName( + ar="المد المتصل وقفا", en="Mottasel Madd at Pause" + ) + ) + golden_len: int = 4 + + +@dataclass +class MottaselMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="المد المتصل", en="Mottasel Madd") + ) + golden_len: int = 4 + + +@dataclass +class LazemMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="المد اللازم", en="Lazem Madd") + ) + golden_len: int = 6 + + +@dataclass +class AaredMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="المد العارض للسكون", en="Aared Madd") + ) + golden_len: int = 4 + + +@dataclass +class LeenMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="مد اللين", en="Leen Madd") + ) + golden_len: int = 4 + + def __post_init__(self): + self.available_tags = {"waw", "yaa"} + super().__post_init__() + self._madd_to_tag = { + alph.phonetics.waw: "waw", + alph.phonetics.yaa: "yaa", + } + + def count(self, ref_text, pred_text) -> int: + # The case where we have Tashkeel after madd (Error from the model) + if pred_text[-1] != pred_text[0]: + return pred_text[:-1].count(ref_text[0]) + 1 + else: + return pred_text.count(ref_text[0]) + 1 + + # TODO: """ diff --git a/tests/test_sub_with_mapping.py b/tests/test_sub_with_mapping.py index 9d40b91..7182dc5 100644 --- a/tests/test_sub_with_mapping.py +++ b/tests/test_sub_with_mapping.py @@ -17,13 +17,14 @@ aya = Aya(2, 1) aya = Aya(19, 1) # aya = Aya(75, 27) - aya = Aya(2, 6) - aya = Aya(2, 7) - aya = Aya(27, 62) - aya = Aya(112, 3) + # aya = Aya(2, 6) + # aya = Aya(2, 7) + # aya = Aya(27, 62) + # aya = Aya(112, 3) # aya = Aya(3, 1) # aya = Aya(30, 28) # aya = Aya(2, 9) + aya = Aya(106, 1) uth_text = aya.get().uthmani # uth_text = aya.get_by_imlaey_words(start=7, window=2).uthmani @@ -32,7 +33,7 @@ # uth_text = "قَلِيلًۭا مِّمَّا" # uth_text = "أَمَّن يُجِيبُ" # uth_text = "قَرِيبٌ" - uth_text = "ٱلْحَقُّ" + # uth_text = "ٱلْحَقُّ" profiler = Profiler() profiler.start() @@ -42,6 +43,7 @@ print(uth_text) print(ph_out.phonemes) print(ph_out.mappings) + print("*" * 40) for idx, uth_c in enumerate(uth_text): print(f"UTH_IDX: `{idx}`, SPAN: `{ph_out.mappings[idx]}`") ph_c = "" diff --git a/tests/test_sub_with_mapping_pytest.py b/tests/test_sub_with_mapping_pytest.py index 2cda8b1..6564960 100644 --- a/tests/test_sub_with_mapping_pytest.py +++ b/tests/test_sub_with_mapping_pytest.py @@ -17,7 +17,11 @@ clean_uthmani_spaces, ) -from quran_transcript.phonetics.tajweed_rulses import NormalMaddRule, Qalqalah +from quran_transcript.phonetics.tajweed_rulses import ( + NormalMaddRule, + Qalqalah, + LeenMaddRule, +) # Import the sub_with_mapping function from the existing test file @@ -1049,6 +1053,35 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(10, 10), tajweed_rules=None, deleted=True), ], ), + ( + "لِإِيلَـٰفِ قُرَيْشٍ", + "لِءِۦۦلَاافِ قُرَيييش", + [ + MappingPos(pos=(0, 1), tajweed_rules=None, deleted=False), + MappingPos(pos=(1, 2), tajweed_rules=None, deleted=False), + MappingPos(pos=(2, 3), tajweed_rules=None, deleted=False), + MappingPos(pos=(3, 4), tajweed_rules=None, deleted=False), + MappingPos(pos=(4, 6), tajweed_rules=[NormalMaddRule(tag="yaa")]), + MappingPos(pos=(6, 7), tajweed_rules=None, deleted=False), + MappingPos(pos=(7, 8), tajweed_rules=None, deleted=False), + MappingPos(pos=(8, 8), tajweed_rules=None, deleted=True), + MappingPos(pos=(8, 10), tajweed_rules=[NormalMaddRule(tag="alif")]), + MappingPos(pos=(10, 11), tajweed_rules=None, deleted=False), + MappingPos(pos=(11, 12), tajweed_rules=None, deleted=False), + MappingPos(pos=(12, 13), tajweed_rules=None, deleted=False), + MappingPos(pos=(13, 14), tajweed_rules=None, deleted=False), + MappingPos(pos=(14, 15), tajweed_rules=None, deleted=False), + MappingPos(pos=(15, 16), tajweed_rules=None, deleted=False), + MappingPos(pos=(16, 17), tajweed_rules=None, deleted=False), + MappingPos( + pos=(17, 20), + tajweed_rules=[LeenMaddRule(golden_len=4, tag="yaa")], + ), + MappingPos(pos=(20, 20), tajweed_rules=None, deleted=True), + MappingPos(pos=(20, 21), tajweed_rules=None, deleted=False), + MappingPos(pos=(21, 21), tajweed_rules=None, deleted=True), + ], + ), ], ) def test_phonetizer_with_mappings( From c59ef1bcce06bcd1729d1ba8a4b586093ded31e9 Mon Sep 17 00:00:00 2001 From: obadx Date: Tue, 17 Feb 2026 13:24:56 +0200 Subject: [PATCH 4/6] add: New index after Qalqlah bug --- quran-script/ph_index.npy | Bin 4366532 -> 4366532 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/quran-script/ph_index.npy b/quran-script/ph_index.npy index ebef823f7e33e2afbd199a3415d497098717e45b..e88716b4b8f57b979e97c3b86cabada26efa0bf4 100644 GIT binary patch delta 695 zcmW-e%WD%+7{$*dwN27qV;;?`og|$}qPA+a4;xcsoCqSgP=pFLQrt)p8bK66is0fi zp)Xt7MY@rKJC}lRL2wa?1nHtXx4N_b0i_Cxo|fUam>*~EH{U(2=Ax@bUWg$saZ86B zkxuCnk9fr=-O?jUj*4H7NkD=UlCbnjM4}RtxFn=c`o)x_q~51Ll+`qD;ueB1;KOvi zT~3n$-7Z^uL#l*vOdyOdbfTocJhYxAR2E63>7sBWt8Xk=O+%S{i_^u?0aKex))hmg zkw8CPA9HD4T|RI&O_I)J&Tf2MvHq$sM~C^O;6hlpUs?MhRm3EUbb}mR)Z1^Y(OI>N zPiWEI=8#?8?X&AWY7L86rkjQzYkDhU$31EtOIV?sK>+LebJCvZP&GWkQ@VT1)#_f` zCK-|?IdYtwAScNn86v01X>x`Ple6R;86l%&jEs}>WI}uG{DNO?LQumnFVc@qy)}|= z-c>=vFxq=4oaLkoDkJZW_doMa$IA8R~};5IUO1+znl9HLcg7I delta 695 zcmW-e&r1|x9LAs7HFedu&7J+d?yNiGuBDabk2E)RXM+ej6di<`kS-yC8$=O=1RcIL z_$v{G#X|_)ItIZDK?h@F2@l=6)-C)4#Dbz8m*I1m56`^c`MytmcR5fe&&7~{1f@le zNvpI;yM&}eI;BgL9G9@1kZy^HDN%_@ToRI$l%ypiJ(3kma_@UTRFsDsxQPf1binf) zm0pt9jf#CRsLB|_IHG7nE6V!I1N&)O6=9*5E`cD5`udW+Ybc9vDY_I|VCn3Nec4cX zq|rl{VJ@$$t4FSn#-htI*H`ati}LXpPw4J4=ldbY zA$_Dkilm>MB&Wy#86>C48FH2kk#poc873oSl#G!JWL$@wiKVdGhTtWJc#&aj>z(0= z-8;%O3?pa^p~%%uzq&r5tLqo5-_$Iw;3}^-%k-@Nd2jMstJ=aMp3&v_{gywoFiA?} z5}6{?WQLUW%!2#+kxFonZs9+Fjh& Date: Tue, 17 Feb 2026 13:25:55 +0200 Subject: [PATCH 5/6] =?UTF-8?q?=D8=A5=D8=B6=D8=A7=D9=81=D8=A9=20=D9=82?= =?UTF-8?q?=D9=88=D8=A7=D8=B9=D8=AF=20=D8=A7=D9=84=D9=85=D8=AF=D9=88=D8=AF?= =?UTF-8?q?=20=D9=88=D8=A7=D9=84=D8=A5=D8=B5=D8=AF=D8=A7=D8=B1=200.5.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- .../phonetics/conv_base_operation.py | 24 ++-- .../phonetics/error_explainer.py | 51 ++++++-- src/quran_transcript/phonetics/operations.py | 22 ++-- .../phonetics/tajweed_rulses.py | 117 +++++++++++++++++- tests/test_explain_error_api.py | 4 + tests/test_sub_with_mapping.py | 8 +- tests/test_sub_with_mapping_pytest.py | 62 ++++++++-- uv.lock | 2 +- 9 files changed, 244 insertions(+), 48 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 24ced46..298216f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] license = "MIT" name = "quran-transcript" -version = "0.5.0" +version = "0.5.1" authors = [ { name="Abdullah", email="abdullahamlyossef@gmail.com" }, ] diff --git a/src/quran_transcript/phonetics/conv_base_operation.py b/src/quran_transcript/phonetics/conv_base_operation.py index e6025ef..e5b2a96 100644 --- a/src/quran_transcript/phonetics/conv_base_operation.py +++ b/src/quran_transcript/phonetics/conv_base_operation.py @@ -472,7 +472,6 @@ def get_mappings( break # Avodig the case where we have qalqlah at the end with no (shadda or skonJ) if new_mappings[m_idx - 1].tajweed_rules is None: - print("here") new_mappings[m_idx - 1].pos = ( new_mappings[m_idx - 1].pos[0], new_mappings[m_idx].pos[1], @@ -621,9 +620,12 @@ def sub_with_mapping( @dataclass class ConversionOperation: - regs: list[tuple[str, str]] | tuple[str, str] + regs: ( + list[tuple[str, str, TajweedRule] | tuple[str, str]] + | tuple[str, str, TajweedRule] + | tuple[str, str] + ) arabic_name: str - tajweed_rules: list[TajweedRule | None] | TajweedRule | None = None ops_before: list["ConversionOperation"] | None = None def __post_init__(self): @@ -633,19 +635,21 @@ def __post_init__(self): if self.ops_before is None: self.ops_before = [] - if self.tajweed_rules: - if not isinstance(self.tajweed_rules, list): - self.tajweed_rules = [self.tajweed_rules] - else: - self.tajweed_rules = [None for _ in range(len(self.regs))] - def forward( self, text, moshaf: MoshafAttributes, mappings: MappingListType | None = None, ) -> tuple[str, MappingListType]: - for (input_reg, out_reg), taj_rule in zip(self.regs, self.tajweed_rules): + for reg in self.regs: + if len(reg) == 2: + input_reg, out_reg = reg + taj_rule = None + elif len(reg) == 3: + input_reg, out_reg, taj_rule = reg + else: + raise ValueError("Invalid Input") + text, mappings = sub_with_mapping( input_reg, out_reg, text, mappings, tajweed_rule=taj_rule ) diff --git a/src/quran_transcript/phonetics/error_explainer.py b/src/quran_transcript/phonetics/error_explainer.py index 15c2751..80b9fa9 100644 --- a/src/quran_transcript/phonetics/error_explainer.py +++ b/src/quran_transcript/phonetics/error_explainer.py @@ -115,6 +115,30 @@ def get_ref_phonetic_groups_tajweed_rules( return ref_tajweed_rules +def get_tasshkeel_error( + ref_ph: str, + pred_ph: str, + uthmani_pos: tuple[int, int], + ph_pos: tuple[int, int], +) -> ReciterError: + if len(pred_ph) > len(ref_ph): + sp_tp = "insert" + elif len(pred_ph) < len(ref_ph): + sp_tp = "delete" + else: + sp_tp = "replace" + + err = ReciterError( + uthmani_pos=uthmani_pos, + ph_pos=ph_pos, + error_type="tashkeel", + speech_error_type=sp_tp, + expected_ph=ref_ph, + preditected_ph=pred_ph, + ) + return err + + def explain_error( uthmani_text: str, ref_ph_text: str, @@ -278,24 +302,27 @@ def explain_error( missing_tajweed_rules=missing_taj_rules, ) ) + if ( + ref_ph[-1] in alph.phonetic_groups.harakat + and pred_ph[-1] != ref_ph[-1] + ): + errors.append( + get_tasshkeel_error( + ref_ph=ref_ph, + pred_ph=pred_ph, + uthmani_pos=uthmani_pos, + ph_pos=ph_pos, + ) + ) # Tashkeel (Harakat) elif ref_ph[-1] in alph.phonetic_groups.harakat: - if len(pred_ph) > len(ref_ph): - sp_tp = "insert" - elif len(pred_ph) < len(ref_ph): - sp_tp = "delete" - else: - sp_tp = "replace" - errors.append( - ReciterError( + get_tasshkeel_error( + ref_ph=ref_ph, + pred_ph=pred_ph, uthmani_pos=uthmani_pos, ph_pos=ph_pos, - error_type="tashkeel", - speech_error_type=sp_tp, - expected_ph=ref_ph, - preditected_ph=pred_ph, ) ) diff --git a/src/quran_transcript/phonetics/operations.py b/src/quran_transcript/phonetics/operations.py index 1ca3afe..9dc0363 100644 --- a/src/quran_transcript/phonetics/operations.py +++ b/src/quran_transcript/phonetics/operations.py @@ -22,6 +22,7 @@ LazemMaddRule, AaredMaddRule, LeenMaddRule, + IdghamKamel, ) @@ -281,7 +282,7 @@ class BeginWithSaken(ConversionOperation): @dataclass class ConvertAlifMaksora(ConversionOperation): arabic_name: str = "تحويل الأف المقصورة إله: حضف أو ألف أو ياء" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # حذف الأف المقصورة من الاسم المقصور النكرة ( @@ -372,7 +373,7 @@ class RemoveSkoonMostadeer(ConversionOperation): @dataclass class SkoonMostateel(ConversionOperation): arabic_name: str = "ضبط السكون المستطيل" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # remove from the middle ( @@ -391,7 +392,7 @@ class SkoonMostateel(ConversionOperation): @dataclass class MaddAlewad(ConversionOperation): arabic_name: str = "ضبط مد العوض وسطا ووقفا" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # remove from the middle ( @@ -421,7 +422,7 @@ class EnlargeSmallLetters(ConversionOperation): arabic_name: str = ( "تكبير الألف والياء والاو والنون الصغار مع حذف مد الصلة عند الوقف" ) - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # small alif ( @@ -489,7 +490,7 @@ class NormalizeTaa(ConversionOperation): ] ) arabic_name: str = "تحويب التاء المربطة في الوسط لتاء وفي الآخر لهاء" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ (f"{uth.taa_marboota}$", f"{uth.haa}"), (f"{uth.taa_marboota}", f"{uth.taa_mabsoota}"), @@ -524,7 +525,7 @@ class PrepareGhonnaIdghamIqlab(ConversionOperation): ] ) arabic_name: str = "فك الإقلاب والعغنة الإدغام" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # النون المقلبة ميمام ( @@ -588,6 +589,7 @@ class PrepareGhonnaIdghamIqlab(ConversionOperation): ( f"([{uth.fatha}{uth.dama}]{uth.yaa}|[{uth.fatha}{uth.kasra}]{uth.waw}|[{uth.pure_letters_without_yaa_and_waw_group}]){uth.space}?([{uth.pure_letters_group}]{uth.shadda})", r"\2", + # IdghamKamel(), ), ] ) @@ -601,7 +603,7 @@ class IltiqaaAlsaknan(ConversionOperation): ] ) arabic_name: str = "التقاء الساكنان وكسر التنوين" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str] | tuple[str, str, TajweedRule]] = field( default_factory=lambda: [ # كسر التنوين ( @@ -892,11 +894,11 @@ def forward( @dataclass class Qalqla(ConversionOperation): arabic_name: str = "إضافة علامة القلقة" - regs: tuple[str, str] = ( + regs: tuple[str, str, TajweedRule] = ( f"([{uth.qlqla_group}](?:{uth.shadda}$|{uth.ras_haaa}|$))", r"\1" + ph.qlqla, + Qalqalah(), ) - tajweed_rules: TajweedRule = field(default_factory=lambda: Qalqalah()) ops_before: list[ConversionOperation] = field( default_factory=lambda: [ CleanEnd(), @@ -907,7 +909,7 @@ class Qalqla(ConversionOperation): @dataclass class RemoveRasHaaAndShadda(ConversionOperation): arabic_name: str = "حذف السكون والشدة م تكرار الحرف المشدد" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # shadda ( diff --git a/src/quran_transcript/phonetics/tajweed_rulses.py b/src/quran_transcript/phonetics/tajweed_rulses.py index e48a1f8..d3f3026 100644 --- a/src/quran_transcript/phonetics/tajweed_rulses.py +++ b/src/quran_transcript/phonetics/tajweed_rulses.py @@ -96,7 +96,7 @@ def count(self, ref_text, pred_text) -> int: def is_ph_str_in(self, ph_str: str) -> bool: """Whether the phonetic script is assoicated with this Tajweed rule or not""" if ph_str: - return ph_str[0] in self._madd_to_tags + return ph_str[0] in self._madd_to_tag else: return False @@ -182,6 +182,121 @@ def count(self, ref_text, pred_text) -> int: return pred_text.count(ref_text[0]) + 1 +@dataclass +class IdghamKamel(TajweedRule): + name: LangName = field( + default_factory=lambda: LangName(ar="إدغام كامل", en="Full Merging") + ) + golden_len: int = 0 + correctness_type: Literal["match", "count"] = "match" + + def match(self, ref_text, pred_text) -> bool: + return ref_text == pred_text + + def is_ph_str_in(self, ph_str: str) -> bool: + """Whether the phonetic script is assoicated with this Tajweed rule or not""" + return True + + def get_relvant_rule(self, ph_str: str) -> Optional["TajweedRule"]: + """Returs a Tajweed rule that is assocaited with the input ph_str""" + return None + + +@dataclass +class GhonnahMetadata: + name: LangName + tag: str + offset: int = 0 + + +@dataclass +class Ghonnah(TajweedRule): + name: LangName + golden_len: int = 4 + correctness_type: Literal["match", "count"] = "count" + offset: int = 0 + + def __post_init__(self): + self.available_tags = { + "noon", + "noon_yaa", + "noon_waw", + "noon_mokhfah", + "meem", + "meem_mokhfah", + } + super().__post_init__() + self._ph_to_metadata = { + alph.phonetics.noon: GhonnahMetadata( + name=field( + default_factory=lambda: LangName( + ar="النون المشددة أو المدغمة", en="Moshadad or Modgham Noon" + ) + ), + tag="noon", + offset=0, + ), + alph.phonetics.yaa: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="noon_yaa", + offset=1, + ), + alph.phonetics.waw: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="noon_waw", + offset=1, + ), + alph.phonetics.noon_mokhfah: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="noon_mokhfah", + offset=1, + ), + alph.phonetics.meem: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="meem", + offset=0, + ), + alph.phonetics.meem_mokhfah: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="meem_mokhfah", + offset=0, + ), + } + + def count(self, ref_text, pred_text) -> int: + return pred_text.count(ref_text[0]) + self.offset + + def is_ph_str_in(self, ph_str: str) -> bool: + """Whether the phonetic script is assoicated with this Tajweed rule or not""" + if ph_str: + return ph_str[0] in self._ph_to_metadata + else: + return False + + def get_relvant_rule(self, ph_str: str) -> Optional["TajweedRule"]: + """Returs a Tajweed rule that is assocaited with the input ph_str""" + if not ph_str: + return None + elif ph_str[0] not in self._ph_to_metadata: + return None + return replace( + self, + name=self._ph_to_metadata[ph_str[0]].name, + offset=self._ph_to_metadata[ph_str[0]].offset, + tag=self._ph_to_metadata[ph_str[0]].tag, + ) + + +@dataclass +class MoshaddadOrModghamNoonRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName( + ar="النون المشددة أو المدغمة", en="Moshaddad or ModghamNoon" + ) + ) + golden_len: int = 4 + + # TODO: """ diff --git a/tests/test_explain_error_api.py b/tests/test_explain_error_api.py index a75dcf6..f536ebb 100644 --- a/tests/test_explain_error_api.py +++ b/tests/test_explain_error_api.py @@ -22,6 +22,10 @@ predicted_text = "ءَلحقق" predicted_text = "ءَلحُقق" + # uthmani_text = "الٓمٓ" + # predicted_text = "ءَلِف لَااااااممممِۦۦۦۦۦۦم" + # predicted_text = "ءَلِف لَاااااممممِۦۦۦۦۦۦم" + ref_ph_out = quran_phonetizer(uthmani_text, moshaf) print(ref_ph_out.phonemes) print(predicted_text) diff --git a/tests/test_sub_with_mapping.py b/tests/test_sub_with_mapping.py index 7182dc5..202e1aa 100644 --- a/tests/test_sub_with_mapping.py +++ b/tests/test_sub_with_mapping.py @@ -13,9 +13,9 @@ ) aya = Aya() aya = Aya(1, 1) - # aya = Aya(12, 1) - aya = Aya(2, 1) - aya = Aya(19, 1) + aya = Aya(12, 1) + # aya = Aya(2, 1) + # aya = Aya(19, 1) # aya = Aya(75, 27) # aya = Aya(2, 6) # aya = Aya(2, 7) @@ -24,7 +24,7 @@ # aya = Aya(3, 1) # aya = Aya(30, 28) # aya = Aya(2, 9) - aya = Aya(106, 1) + # aya = Aya(106, 1) uth_text = aya.get().uthmani # uth_text = aya.get_by_imlaey_words(start=7, window=2).uthmani diff --git a/tests/test_sub_with_mapping_pytest.py b/tests/test_sub_with_mapping_pytest.py index 6564960..c36a83b 100644 --- a/tests/test_sub_with_mapping_pytest.py +++ b/tests/test_sub_with_mapping_pytest.py @@ -21,6 +21,9 @@ NormalMaddRule, Qalqalah, LeenMaddRule, + AaredMaddRule, + LazemMaddRule, + MottaselMaddRule, ) # Import the sub_with_mapping function from the existing test file @@ -643,7 +646,14 @@ def test_merge_mappings_complex_range(self): "ءَلِف لَاااااام رَاا تِلكَ ءَاايَااتُ لكِتَاابِ لمُبِۦۦۦۦن", [ MappingPos(pos=(0, 6), tajweed_rules=None), - MappingPos(pos=(6, 16), tajweed_rules=None), + MappingPos( + pos=(6, 16), + tajweed_rules=[ + LazemMaddRule( + tag="alif", + ) + ], + ), MappingPos(pos=(16, 16), deleted=True), MappingPos( pos=(16, 20), @@ -711,7 +721,15 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(50, 51), tajweed_rules=None), MappingPos(pos=(51, 52), tajweed_rules=None), MappingPos(pos=(52, 53), tajweed_rules=None), - MappingPos(pos=(53, 57), tajweed_rules=None), + MappingPos( + pos=(53, 57), + tajweed_rules=[ + AaredMaddRule( + golden_len=4, + tag="yaa", + ) + ], + ), MappingPos(pos=(57, 58), tajweed_rules=None), MappingPos(pos=(58, 58), deleted=True), ], @@ -756,8 +774,16 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(27, 28), tajweed_rules=None), MappingPos(pos=(28, 29), tajweed_rules=None), MappingPos(pos=(29, 30), tajweed_rules=None), - MappingPos(pos=(30, 34), tajweed_rules=None), - MappingPos(pos=(34, 35), tajweed_rules=None), + MappingPos( + pos=(30, 34), + tajweed_rules=[ + AaredMaddRule( + golden_len=4, + tag="yaa", + ) + ], + ), + MappingPos(pos=(34, 35), deleted=False), MappingPos(pos=(35, 35), deleted=True), ], ), @@ -809,7 +835,10 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(28, 29), tajweed_rules=None), MappingPos(pos=(29, 30), tajweed_rules=None), MappingPos(pos=(30, 31), tajweed_rules=None), - MappingPos(pos=(31, 35), tajweed_rules=None), + MappingPos( + pos=(31, 35), + tajweed_rules=[MottaselMaddRule(golden_len=4, tag="alif")], + ), MappingPos(pos=(35, 35), deleted=True), MappingPos(pos=(35, 36), tajweed_rules=None), MappingPos(pos=(36, 38), tajweed_rules=None), @@ -882,7 +911,15 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(93, 94), tajweed_rules=None), MappingPos(pos=(94, 95), tajweed_rules=None), MappingPos(pos=(95, 96), tajweed_rules=None), - MappingPos(pos=(96, 100), tajweed_rules=None), + MappingPos( + pos=(96, 100), + tajweed_rules=[ + AaredMaddRule( + golden_len=4, + tag="waw", + ) + ], + ), MappingPos(pos=(100, 101), tajweed_rules=None), MappingPos(pos=(101, 101), deleted=True), ], @@ -923,9 +960,16 @@ def test_merge_mappings_complex_range(self): "ءَلِف لَااااااممممِۦۦۦۦۦۦم", [ MappingPos(pos=(0, 6), tajweed_rules=None, deleted=False), - MappingPos(pos=(6, 14), tajweed_rules=None, deleted=False), + MappingPos( + pos=(6, 14), + tajweed_rules=[LazemMaddRule(tag="alif")], + deleted=False, + ), MappingPos(pos=(14, 14), tajweed_rules=None, deleted=True), - MappingPos(pos=(14, 26), tajweed_rules=None, deleted=False), + MappingPos( + pos=(14, 26), + tajweed_rules=[LazemMaddRule(tag="yaa")], + ), MappingPos(pos=(26, 26), tajweed_rules=None, deleted=True), ], ), @@ -1048,7 +1092,7 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(1, 2), tajweed_rules=None, deleted=False), MappingPos(pos=(2, 3), tajweed_rules=None, deleted=False), MappingPos(pos=(3, 4), tajweed_rules=None, deleted=False), - MappingPos(pos=(4, 8), tajweed_rules=None, deleted=False), + MappingPos(pos=(4, 8), tajweed_rules=[AaredMaddRule(tag="yaa")]), MappingPos(pos=(8, 10), tajweed_rules=[Qalqalah()]), MappingPos(pos=(10, 10), tajweed_rules=None, deleted=True), ], diff --git a/uv.lock b/uv.lock index b6c8696..f7d108f 100644 --- a/uv.lock +++ b/uv.lock @@ -557,7 +557,7 @@ wheels = [ [[package]] name = "quran-transcript" -version = "0.4.1" +version = "0.5.1" source = { editable = "." } dependencies = [ { name = "fuzzysearch" }, From c90e0ab026a98ed688e57c0c0bdf1436be7cb7e2 Mon Sep 17 00:00:00 2001 From: obadx Date: Tue, 17 Feb 2026 13:44:47 +0200 Subject: [PATCH 6/6] =?UTF-8?q?=D8=A7=D9=84=D8=A5=D8=B5=D8=AF=D8=A7=D8=B1?= =?UTF-8?q?=200.5.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 122 ++++++++++++++++++ src/quran_transcript/__init__.py | 3 + .../phonetics/error_explainer.py | 90 ++++++++++++- tests/test_explain_error_api.py | 17 ++- 4 files changed, 224 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1cf7909..2c6d228 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,22 @@ [colab-url]: https://colab.research.google.com/drive/1d9-mVu2eiPOPS9z5sS2V4TQ579xIUBi-?usp=sharing # `quran-transcript` package +## 🆕 ما الجديد في الإصدار 0.5.1 (What's New in Version 0.5.1) + +### 🎯 تحليل أخطاء التلاوة (Recitation Error Analysis) +- إضافة دالة `explain_error` لمقارنة النص الصوتي المتوقع (المرجع) مع النص الصوتي المُتنبأ به (مثلاً من قارئ أو نموذج تعلم آلي). +- توفير تحليل تفصيلي للأخطاء يشمل: + - نوع الخطأ: تجويدي (`tajweed`)، عادي (`normal`)، أو حركات (`tashkeel`). + - نوع الخطأ الكلامي: إدراج (`insert`)، حذف (`delete`)، أو استبدال (`replace`). + - قواعد التجويد المرتبطة بالخطأ (مثل المد، القلقلة، الغنة) مع تحديد الطول المتوقع والفعلي عند الاقتضاء. +- تمثيل النتائج باستخدام كائن `ReciterError` الذي يحتوي على معلومات دقيقة عن موقع الخطأ في النص العثماني والصوتي. +- هذه الأداة مفيدة لتقييم أداء قراء القرآن، وتحليل أخطاء نماذج التعرف على الكلام، وتقديم تغذية راجعة للمتعلمين. + + + + + + ## 🆕 ما الجديد في الإصدار 0.4.0 (What's New in Version 0.4.0) @@ -253,6 +269,112 @@ print(f"النص العثماني: {uthmani_text}") - `start`: موقع بداية المطابقة - `end`: موقع نهاية المطابقة (غير شامل) +### 📖 مثال على تحليل أخطاء التلاوة (Error Analysis Example) + +```python +from quran_transcript import ( + quran_phonetizer, + MoshafAttributes, + ReciterError, + explain_error, +) + +# إعداد خصائص المصحف +moshaf = MoshafAttributes( + rewaya="hafs", + madd_monfasel_len=4, + madd_mottasel_len=4, + madd_mottasel_waqf=4, + madd_aared_len=4, +) + +# النص العثماني الأصلي +uthmani_text = "قَالُوٓا۟" + +# نص صوتي متوقع (مرجعي) +ref_out = quran_phonetizer(uthmani_text, moshaf) +print("المرجع:", ref_out.phonemes) + +# نص صوتي مُتنبأ به (به أخطاء) +predicted_text = "فكۥۥلۥۥ" +print("المتنبأ به:", predicted_text) + +# تحليل الأخطاء +errors = explain_error( + uthmani_text=uthmani_text, + ref_ph_text=ref_out.phonemes, + predicted_ph_text=predicted_text, + mappings=ref_out.mappings, +) + +# عرض النتائج +for err in errors: + print("\n" + "="*50) + print(f"الموقع في العثماني: `{uthmani_text[err.uthmani_pos[0]:err.uthmani_pos[1]]}`, {err.uthmani_pos}") + print(f"الموقع في الصوتي: `{ref_out.phonemes[err.ph_pos[0]:err.ph_pos[1]]}`, {err.ph_pos}") + print(f"نوع الخطأ: {err.error_type} - {err.speech_error_type}") + print(f"المتوقع: '{err.expected_ph}' - المُتنبأ به: '{err.preditected_ph}'") + if err.ref_tajweed_rules: + for rule in err.ref_tajweed_rules: + print(f" قاعدة تجويد مرجعية: {rule.name.ar} ({rule.name.en})") + if err.replaced_tajweed_rules: + for rule in err.replaced_tajweed_rules: + print(f" قاعدة تجويد مستبدلة: {rule.name.ar} ({rule.name.en})") + if err.missing_tajweed_rules: + for rule in err.missing_tajweed_rules: + print(f" قاعدة تجويد مفقودة: {rule.name.ar} ({rule.name.en})") +``` + +**مخرجات متوقعة (Partial output):** +``` +المرجع: قَاالُۥۥ +المتنبأ به: فكۥۥلۥۥ + +================================================== +الموقع في العثماني: ``, (0, 0) +الموقع في الصوتي: ``, (0, 0) +نوع الخطأ: normal - insert +المتوقع: '' - المُتنبأ به: 'ف' + +================================================== +الموقع في العثماني: `قَ`, (0, 2) +الموقع في الصوتي: `قَ`, (0, 2) +نوع الخطأ: normal - replace +المتوقع: 'قَ' - المُتنبأ به: 'ك' + +================================================== +الموقع في العثماني: `ا`, (2, 3) +الموقع في الصوتي: `اا`, (2, 4) +نوع الخطأ: tajweed - replace +المتوقع: 'اا' - المُتنبأ به: 'ۥۥ' + قاعدة تجويد مرجعية: المد الطبيعي (Normal Madd) + قاعدة تجويد مستبدلة: المد الطبيعي (Normal Madd) + +================================================== +الموقع في العثماني: `لُ`, (3, 5) +الموقع في الصوتي: `لُ`, (4, 6) +نوع الخطأ: tashkeel - delete +المتوقع: 'لُ' - المُتنبأ به: 'ل' +``` + +--- + +### 📦 كائنات تحليل الأخطاء (Error Analysis Dataclasses) + +#### `ReciterError` +يمثل خطأ واحد في التلاوة: +- `uthmani_pos`: tuple[int, int] – موقع الخطأ في النص العثماني (بداية، نهاية). +- `ph_pos`: tuple[int, int] – موقع الخطأ في النص الصوتي المرجعي (بداية، نهاية). +- `error_type`: Literal["tajweed", "normal", "tashkeel"] – نوع الخطأ. +- `speech_error_type`: Literal["insert", "delete", "replace"] – نوع الخطأ الكلامي. +- `expected_ph`: str – المقطع الصوتي المتوقع. +- `preditected_ph`: str – المقطع الصوتي المُتنبأ به. +- `expected_len`: Optional[int] – الطول المتوقع (لأخطاء المد مثلاً). +- `predicted_len`: Optional[int] – الطول الفعلي. +- `ref_tajweed_rules`: Optional[list[TajweedRule]] – قواعد التجويد المرتبطة بالمقطع المتوقع. +- `inserted_tajweed_rules`, `replaced_tajweed_rules`, `missing_tajweed_rules`: Optional[list[TajweedRule]] – قواعد التجويد التي تم إدراجها أو استبدالها أو فقدانها. + + ### الحروف: (43) diff --git a/src/quran_transcript/__init__.py b/src/quran_transcript/__init__.py index 740c68a..6c33af3 100644 --- a/src/quran_transcript/__init__.py +++ b/src/quran_transcript/__init__.py @@ -23,6 +23,7 @@ NoPhonemesSearchResult, PhoneticSearch, ) +from .phonetics.error_explainer import explain_error, ReciterError from . import alphabet as alphabet @@ -54,4 +55,6 @@ "PhonmesSearhResult", "NoPhonemesSearchResult", "PhoneticSearch", + "explain_error", + "ReciterError", ] diff --git a/src/quran_transcript/phonetics/error_explainer.py b/src/quran_transcript/phonetics/error_explainer.py index 80b9fa9..3b3974a 100644 --- a/src/quran_transcript/phonetics/error_explainer.py +++ b/src/quran_transcript/phonetics/error_explainer.py @@ -145,7 +145,95 @@ def explain_error( predicted_ph_text: str, mappings: MappingListType, ) -> list[ReciterError]: - """ """ + """Explain errors in a predicted phonetic transcription compared to the reference. + + This function performs a detailed alignment between the reference (correct) phonetic + transcription and a predicted transcription (e.g., from a speech recognition system + or a learner's recitation). It breaks both strings into phoneme groups (using + `chunck_phonemes`) and aligns them using Levenshtein opcodes on the first character + of each group. For each aligned group, it checks for: + - Insertions, deletions, or substitutions of whole groups. + - Tajweed rule violations (e.g., incorrect Madd length, missing Qalqalah or Ghonnah). + - Mismatches in short vowels (harakat) or other diacritics. + - Special phonetic marks (Imala, Sakt, etc.) – currently placeholders. + + The function uses the provided `mappings` to locate each error in the original + Uthmani text and to associate Tajweed rules with reference phoneme groups. + The result is a list of `ReciterError` objects that can be used for feedback, + error analysis, or pronunciation training. + + Args: + uthmani_text: The original Uthmani script text (used to locate the error source). + ref_ph_text: The reference phonetic string (correct recitation), as produced by + `quran_phonetizer` or a similar function. + predicted_ph_text: The predicted phonetic string to be evaluated. + mappings: A list of `MappingPos` objects that link each Uthmani character to its + corresponding range(s) in the reference phonetic string. This mapping + must cover the entire `ref_ph_text` and be consistent (no phonetic index + maps to two different Uthmani indices). + + Returns: + A list of `ReciterError` dataclass instances, each describing a single error. + The list is ordered by the occurrence of errors along the phonetic sequence. + Each error contains: + - Uthmani and phonetic positions (start, end) where the error occurs. + - Error type: "tajweed", "normal", or "tashkeel". + - Speech error type: "insert", "delete", or "replace". + - Expected and predicted phonetic substrings. + - For tajweed errors: expected/predicted lengths (if applicable) and the + relevant Tajweed rules (reference, replaced, missing). + Errors are not merged; every mismatch in a phoneme group produces at least one error. + + Raises: + ValueError: If the same phonetic index is mapped to multiple Uthmani indices + (inconsistent mapping) or if an unsupported `correctness_type` is + encountered in a Tajweed rule. + + Examples: + Basic usage with a single word: + >>> moshaf = MoshafAttributes(...) + >>> uth_text = "قَالُوٓا۟" + >>> ref_out = quran_phonetizer(uth_text, moshaf) + >>> pred_text = "كالۥۥ" + >>> errors = explain_error(uth_text, ref_out.phonemes, pred_text, ref_out.mappings) + >>> for err in errors: + ... print(err.error_type, err.speech_error_type, err.expected_ph, err.preditected_ph) + tajweed replace اااااا ۥۥ + tashkeel delete لُ ل + + Example showing a missing Qalqalah: + >>> uth_text = "ٱلْحَقُّ" + >>> ref_out = quran_phonetizer(uth_text, moshaf) + >>> pred_text = "ءَلحقق" + >>> errors = explain_error(uth_text, ref_out.phonemes, pred_text, ref_out.mappings) + >>> for err in errors: + ... if err.error_type == 'tajweed' and err.ref_tajweed_rules: + ... print(err.ref_tajweed_rules[0].name.en) + Qalqalah + + Example with a Madd error (Lazem Madd): + >>> uth_text = "الٓمٓ" + >>> ref_out = quran_phonetizer(uth_text, moshaf) + >>> pred_text = "ءَلِف لَاااااممممِۦۦۦۦۦۦم" + >>> errors = explain_error(uth_text, ref_out.phonemes, pred_text, ref_out.mappings) + >>> for err in errors: + ... if err.ref_tajweed_rules: + ... rule = err.ref_tajweed_rules[0] + ... print(rule.name.en, err.expected_len, err.predicted_len) + Lazem Madd 6 5 + + Notes: + - The alignment is performed on the **first character** of each phoneme group. + This works because the first character is the base consonant or vowel, and the + rest of the group contains diacritics or lengthening marks. However, it means + that errors involving only the diacritics of an otherwise correctly pronounced + base will be caught in the "equal" branch (via tashkeel or tajweed checks). + - Inserted groups that have no corresponding reference are given a zero‑width + Uthmani position (start = end) based on the nearest preceding reference group. + This is a heuristic and may be refined in the future. + - The function contains TODOs for improving the precision of Uthmani positions + in insertions and for handling special phonetic marks like Imala and Sakt. + """ ref_ph_groups = chunck_phonemes(ref_ph_text) pred_ph_groups = chunck_phonemes(predicted_ph_text) diff --git a/tests/test_explain_error_api.py b/tests/test_explain_error_api.py index f536ebb..d2345a7 100644 --- a/tests/test_explain_error_api.py +++ b/tests/test_explain_error_api.py @@ -1,6 +1,9 @@ -from quran_transcript.phonetics.error_explainer import ReciterError, explain_error - -from quran_transcript import quran_phonetizer, MoshafAttributes +from quran_transcript import ( + quran_phonetizer, + MoshafAttributes, + ReciterError, + explain_error, +) if __name__ == "__main__": @@ -17,10 +20,10 @@ predicted_text = "فكالۥۥ" predicted_text = "فكۥۥلۥۥ" - uthmani_text = "ٱلْحَقُّ" - predicted_text = "ءَلحَقق" - predicted_text = "ءَلحقق" - predicted_text = "ءَلحُقق" + # uthmani_text = "ٱلْحَقُّ" + # predicted_text = "ءَلحَقق" + # predicted_text = "ءَلحقق" + # predicted_text = "ءَلحُقق" # uthmani_text = "الٓمٓ" # predicted_text = "ءَلِف لَااااااممممِۦۦۦۦۦۦم"