diff --git a/README.md b/README.md index 1cf7909..2c6d228 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,22 @@ [colab-url]: https://colab.research.google.com/drive/1d9-mVu2eiPOPS9z5sS2V4TQ579xIUBi-?usp=sharing # `quran-transcript` package +## 🆕 ما الجديد في الإصدار 0.5.1 (What's New in Version 0.5.1) + +### 🎯 تحليل أخطاء التلاوة (Recitation Error Analysis) +- إضافة دالة `explain_error` لمقارنة النص الصوتي المتوقع (المرجع) مع النص الصوتي المُتنبأ به (مثلاً من قارئ أو نموذج تعلم آلي). +- توفير تحليل تفصيلي للأخطاء يشمل: + - نوع الخطأ: تجويدي (`tajweed`)، عادي (`normal`)، أو حركات (`tashkeel`). + - نوع الخطأ الكلامي: إدراج (`insert`)، حذف (`delete`)، أو استبدال (`replace`). + - قواعد التجويد المرتبطة بالخطأ (مثل المد، القلقلة، الغنة) مع تحديد الطول المتوقع والفعلي عند الاقتضاء. +- تمثيل النتائج باستخدام كائن `ReciterError` الذي يحتوي على معلومات دقيقة عن موقع الخطأ في النص العثماني والصوتي. +- هذه الأداة مفيدة لتقييم أداء قراء القرآن، وتحليل أخطاء نماذج التعرف على الكلام، وتقديم تغذية راجعة للمتعلمين. 
+ + + + + + ## 🆕 ما الجديد في الإصدار 0.4.0 (What's New in Version 0.4.0) @@ -253,6 +269,112 @@ print(f"النص العثماني: {uthmani_text}") - `start`: موقع بداية المطابقة - `end`: موقع نهاية المطابقة (غير شامل) +### 📖 مثال على تحليل أخطاء التلاوة (Error Analysis Example) + +```python +from quran_transcript import ( + quran_phonetizer, + MoshafAttributes, + ReciterError, + explain_error, +) + +# إعداد خصائص المصحف +moshaf = MoshafAttributes( + rewaya="hafs", + madd_monfasel_len=4, + madd_mottasel_len=4, + madd_mottasel_waqf=4, + madd_aared_len=4, +) + +# النص العثماني الأصلي +uthmani_text = "قَالُوٓا۟" + +# نص صوتي متوقع (مرجعي) +ref_out = quran_phonetizer(uthmani_text, moshaf) +print("المرجع:", ref_out.phonemes) + +# نص صوتي مُتنبأ به (به أخطاء) +predicted_text = "فكۥۥلۥۥ" +print("المتنبأ به:", predicted_text) + +# تحليل الأخطاء +errors = explain_error( + uthmani_text=uthmani_text, + ref_ph_text=ref_out.phonemes, + predicted_ph_text=predicted_text, + mappings=ref_out.mappings, +) + +# عرض النتائج +for err in errors: + print("\n" + "="*50) + print(f"الموقع في العثماني: `{uthmani_text[err.uthmani_pos[0]:err.uthmani_pos[1]]}`, {err.uthmani_pos}") + print(f"الموقع في الصوتي: `{ref_out.phonemes[err.ph_pos[0]:err.ph_pos[1]]}`, {err.ph_pos}") + print(f"نوع الخطأ: {err.error_type} - {err.speech_error_type}") + print(f"المتوقع: '{err.expected_ph}' - المُتنبأ به: '{err.preditected_ph}'") + if err.ref_tajweed_rules: + for rule in err.ref_tajweed_rules: + print(f" قاعدة تجويد مرجعية: {rule.name.ar} ({rule.name.en})") + if err.replaced_tajweed_rules: + for rule in err.replaced_tajweed_rules: + print(f" قاعدة تجويد مستبدلة: {rule.name.ar} ({rule.name.en})") + if err.missing_tajweed_rules: + for rule in err.missing_tajweed_rules: + print(f" قاعدة تجويد مفقودة: {rule.name.ar} ({rule.name.en})") +``` + +**مخرجات متوقعة (Partial output):** +``` +المرجع: قَاالُۥۥ +المتنبأ به: فكۥۥلۥۥ + +================================================== +الموقع في العثماني: ``, (0, 0) +الموقع في الصوتي: 
``, (0, 0) +نوع الخطأ: normal - insert +المتوقع: '' - المُتنبأ به: 'ف' + +================================================== +الموقع في العثماني: `قَ`, (0, 2) +الموقع في الصوتي: `قَ`, (0, 2) +نوع الخطأ: normal - replace +المتوقع: 'قَ' - المُتنبأ به: 'ك' + +================================================== +الموقع في العثماني: `ا`, (2, 3) +الموقع في الصوتي: `اا`, (2, 4) +نوع الخطأ: tajweed - replace +المتوقع: 'اا' - المُتنبأ به: 'ۥۥ' + قاعدة تجويد مرجعية: المد الطبيعي (Normal Madd) + قاعدة تجويد مستبدلة: المد الطبيعي (Normal Madd) + +================================================== +الموقع في العثماني: `لُ`, (3, 5) +الموقع في الصوتي: `لُ`, (4, 6) +نوع الخطأ: tashkeel - delete +المتوقع: 'لُ' - المُتنبأ به: 'ل' +``` + +--- + +### 📦 كائنات تحليل الأخطاء (Error Analysis Dataclasses) + +#### `ReciterError` +يمثل خطأً واحدًا في التلاوة: +- `uthmani_pos`: tuple[int, int] – موقع الخطأ في النص العثماني (بداية، نهاية). +- `ph_pos`: tuple[int, int] – موقع الخطأ في النص الصوتي المرجعي (بداية، نهاية). +- `error_type`: Literal["tajweed", "normal", "tashkeel"] – نوع الخطأ. +- `speech_error_type`: Literal["insert", "delete", "replace"] – نوع الخطأ الكلامي. +- `expected_ph`: str – المقطع الصوتي المتوقع. +- `preditected_ph`: str – المقطع الصوتي المُتنبأ به. +- `expected_len`: Optional[int] – الطول المتوقع (لأخطاء المد مثلاً). +- `predicted_len`: Optional[int] – الطول الفعلي. +- `ref_tajweed_rules`: Optional[list[TajweedRule]] – قواعد التجويد المرتبطة بالمقطع المتوقع. +- `inserted_tajweed_rules`, `replaced_tajweed_rules`, `missing_tajweed_rules`: Optional[list[TajweedRule]] – قواعد التجويد التي تم إدراجها أو استبدالها أو فقدانها. 
+ + ### الحروف: (43) diff --git a/pyproject.toml b/pyproject.toml index 1571a55..298216f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] license = "MIT" name = "quran-transcript" -version = "0.4.0" +version = "0.5.1" authors = [ { name="Abdullah", email="abdullahamlyossef@gmail.com" }, ] diff --git a/quran-script/ph_index.npy b/quran-script/ph_index.npy index ebef823..e88716b 100644 Binary files a/quran-script/ph_index.npy and b/quran-script/ph_index.npy differ diff --git a/src/quran_transcript/__init__.py b/src/quran_transcript/__init__.py index 740c68a..6c33af3 100644 --- a/src/quran_transcript/__init__.py +++ b/src/quran_transcript/__init__.py @@ -23,6 +23,7 @@ NoPhonemesSearchResult, PhoneticSearch, ) +from .phonetics.error_explainer import explain_error, ReciterError from . import alphabet as alphabet @@ -54,4 +55,6 @@ "PhonmesSearhResult", "NoPhonemesSearchResult", "PhoneticSearch", + "explain_error", + "ReciterError", ] diff --git a/src/quran_transcript/phonetics/conv_base_operation.py b/src/quran_transcript/phonetics/conv_base_operation.py index 2c5f719..e5b2a96 100644 --- a/src/quran_transcript/phonetics/conv_base_operation.py +++ b/src/quran_transcript/phonetics/conv_base_operation.py @@ -406,6 +406,21 @@ def get_mappings( # TODO: remove this assert all(m is not None for m in new_mappings) + # Special Case where we want to assgin the tag for Leen Madd + for m_idx in range(len(new_mappings)): + if new_mappings[m_idx].tajweed_rules: + for taj_idx in range(len(new_mappings[m_idx].tajweed_rules)): + if ( + new_mappings[m_idx].tajweed_rules[taj_idx].name.en == "Leen Madd" + and new_mappings[m_idx].tajweed_rules[taj_idx].tag is None + ): + tag = ( + new_mappings[m_idx] + .tajweed_rules[taj_idx] + ._madd_to_tag[new_text[new_mappings[m_idx].pos[0]]] + ) + new_mappings[m_idx].tajweed_rules[taj_idx].tag = tag + # Special case where we have Idgham tanween # Special sympol `tanweed_idgham_detrminer` 
has no meaning moving it to the tanween for re_out in re.finditer(f"{alph.uthmani.tanween_idhaam_dterminer}[^$]", text): @@ -441,6 +456,34 @@ def get_mappings( new_mappings = merge_mappings(mappings, new_mappings) + # Special case where skoon sign is repaced with qalalah sign + # We want the qalqlah sign associated with the letter it self not the + # Did not want that but no way to solve exept with this + for re_out in re.finditer( + f"[^{alph.uthmani.ras_haaa}{alph.uthmani.shadda}]({alph.phonetics.qlqla})", + new_text, + ): + qlq_idx = re_out.span(1)[0] + char_idx = qlq_idx - 1 + # getting skon or shadda idx in the merged mappings + m_idx = 0 + for m_idx in range(len(new_mappings)): + if new_mappings[m_idx].pos[0] == qlq_idx: + break + # Avodig the case where we have qalqlah at the end with no (shadda or skonJ) + if new_mappings[m_idx - 1].tajweed_rules is None: + new_mappings[m_idx - 1].pos = ( + new_mappings[m_idx - 1].pos[0], + new_mappings[m_idx].pos[1], + ) + new_mappings[m_idx - 1].tajweed_rules = new_mappings[m_idx].tajweed_rules + new_mappings[m_idx].pos = ( + new_mappings[m_idx].pos[1], + new_mappings[m_idx].pos[1], + ) + new_mappings[m_idx].deleted = True + new_mappings[m_idx].tajweed_rules = None + # TODO: remove this curr_m = None next_m = None @@ -577,7 +620,11 @@ def sub_with_mapping( @dataclass class ConversionOperation: - regs: list[tuple[str, str]] | tuple[str, str] + regs: ( + list[tuple[str, str, TajweedRule] | tuple[str, str]] + | tuple[str, str, TajweedRule] + | tuple[str, str] + ) arabic_name: str ops_before: list["ConversionOperation"] | None = None @@ -594,8 +641,18 @@ def forward( moshaf: MoshafAttributes, mappings: MappingListType | None = None, ) -> tuple[str, MappingListType]: - for input_reg, out_reg in self.regs: - text, mappings = sub_with_mapping(input_reg, out_reg, text, mappings) + for reg in self.regs: + if len(reg) == 2: + input_reg, out_reg = reg + taj_rule = None + elif len(reg) == 3: + input_reg, out_reg, taj_rule = reg + 
else: + raise ValueError("Invalid Input") + + text, mappings = sub_with_mapping( + input_reg, out_reg, text, mappings, tajweed_rule=taj_rule + ) return text, mappings def apply( diff --git a/src/quran_transcript/phonetics/error_explainer.py b/src/quran_transcript/phonetics/error_explainer.py index dde5b1d..3b3974a 100644 --- a/src/quran_transcript/phonetics/error_explainer.py +++ b/src/quran_transcript/phonetics/error_explainer.py @@ -9,7 +9,7 @@ from .tajweed_rulses import TajweedRule, NormalMaddRule -from .conv_base_operation import MappingPos +from .conv_base_operation import MappingPos, MappingListType @dataclass @@ -22,8 +22,10 @@ class ReciterError: preditected_ph: str expected_len: Optional[int] | None = None predicted_len: Optional[int] | None = None - tajweed_rules: Optional[list[TajweedRule]] | None = None - predicted_tajweed_rules: Optional[list[TajweedRule]] | None = None + ref_tajweed_rules: Optional[list[TajweedRule]] | None = None + inserted_tajweed_rules: Optional[list[TajweedRule]] | None = None + replaced_tajweed_rules: Optional[list[TajweedRule]] | None = None + missing_tajweed_rules: Optional[list[TajweedRule]] | None = None @dataclass @@ -77,48 +79,161 @@ def align_phonemes_groups( def extract_ref_phonetic_to_uthmani( - mappings: list[MappingPos | None], + mappings: MappingListType, ) -> dict[int, int]: ref_ph_to_uthmani = {} for idx, map_pos in enumerate(mappings): - if map_pos is not None: - for ph_idx in range(*map_pos.pos): - if ph_idx in ref_ph_to_uthmani: - raise ValueError( - f"Same phonetic scripts has multiple uthmani script. Phonetic posision: `{ph_idx}`, Uthmani Poses: `{ref_ph_to_uthmani[ph_idx]}, {idx}`" - ) - else: - ref_ph_to_uthmani[ph_idx] = idx + for ph_idx in range(*map_pos.pos): + if ph_idx in ref_ph_to_uthmani: + raise ValueError( + f"Same phonetic scripts has multiple uthmani script. 
Phonetic posision: `{ph_idx}`, Uthmani Poses: `{ref_ph_to_uthmani[ph_idx]}, {idx}`" + ) + else: + ref_ph_to_uthmani[ph_idx] = idx return ref_ph_to_uthmani def get_ref_phonetic_groups_tajweed_rules( ref_ph_groups: list[str], - mappings: list[MappingPos | None], + mappings: MappingListType, ref_ph_to_uthmani: dict[int, int], -) -> list[None | list[TajweedRule]]: - ref_tajweed_rules = [None] * len(ref_ph_groups) +) -> list[list[TajweedRule]]: + ref_tajweed_rules: list[TajweedRule] = [[] for _ in range(len(ref_ph_groups))] start = 0 end = 0 - # Computing Tajweed rules - # TODO: O(n^2) too bad should be O(n) + # Computing Tajweed rules O(len(ref_ph_text)) for ph_g_idx, ph_g in enumerate(ref_ph_groups): end += len(ph_g) - for map_pos in mappings: - if map_pos is not None: - if start >= map_pos.pos[0] and end <= map_pos.pos[1]: - if ref_tajweed_rules[ph_g_idx] is None: - ref_tajweed_rules[ph_g_idx] = map_pos.tajweed_rules - else: - ref_tajweed_rules[ph_g_idx] += map_pos.tajweed_rules + used_uth_ids = set() + for ph_idx in range(start, end): + uth_idx = ref_ph_to_uthmani[ph_idx] + if uth_idx not in used_uth_ids: + used_uth_ids.add(uth_idx) + if mappings[uth_idx].tajweed_rules: + ref_tajweed_rules[ph_g_idx].extend(mappings[uth_idx].tajweed_rules) start = end return ref_tajweed_rules +def get_tasshkeel_error( + ref_ph: str, + pred_ph: str, + uthmani_pos: tuple[int, int], + ph_pos: tuple[int, int], +) -> ReciterError: + if len(pred_ph) > len(ref_ph): + sp_tp = "insert" + elif len(pred_ph) < len(ref_ph): + sp_tp = "delete" + else: + sp_tp = "replace" + + err = ReciterError( + uthmani_pos=uthmani_pos, + ph_pos=ph_pos, + error_type="tashkeel", + speech_error_type=sp_tp, + expected_ph=ref_ph, + preditected_ph=pred_ph, + ) + return err + + def explain_error( - uthmani_text, ref_ph_text, predicted_ph_text, mappings: list[MappingPos | None] + uthmani_text: str, + ref_ph_text: str, + predicted_ph_text: str, + mappings: MappingListType, ) -> list[ReciterError]: - """ """ + """Explain 
errors in a predicted phonetic transcription compared to the reference. + + This function performs a detailed alignment between the reference (correct) phonetic + transcription and a predicted transcription (e.g., from a speech recognition system + or a learner's recitation). It breaks both strings into phoneme groups (using + `chunck_phonemes`) and aligns them using Levenshtein opcodes on the first character + of each group. For each aligned group, it checks for: + - Insertions, deletions, or substitutions of whole groups. + - Tajweed rule violations (e.g., incorrect Madd length, missing Qalqalah or Ghonnah). + - Mismatches in short vowels (harakat) or other diacritics. + - Special phonetic marks (Imala, Sakt, etc.) – currently placeholders. + + The function uses the provided `mappings` to locate each error in the original + Uthmani text and to associate Tajweed rules with reference phoneme groups. + The result is a list of `ReciterError` objects that can be used for feedback, + error analysis, or pronunciation training. + + Args: + uthmani_text: The original Uthmani script text (used to locate the error source). + ref_ph_text: The reference phonetic string (correct recitation), as produced by + `quran_phonetizer` or a similar function. + predicted_ph_text: The predicted phonetic string to be evaluated. + mappings: A list of `MappingPos` objects that link each Uthmani character to its + corresponding range(s) in the reference phonetic string. This mapping + must cover the entire `ref_ph_text` and be consistent (no phonetic index + maps to two different Uthmani indices). + + Returns: + A list of `ReciterError` dataclass instances, each describing a single error. + The list is ordered by the occurrence of errors along the phonetic sequence. + Each error contains: + - Uthmani and phonetic positions (start, end) where the error occurs. + - Error type: "tajweed", "normal", or "tashkeel". + - Speech error type: "insert", "delete", or "replace". 
+ - Expected and predicted phonetic substrings. + - For tajweed errors: expected/predicted lengths (if applicable) and the + relevant Tajweed rules (reference, replaced, missing). + Errors are not merged; every mismatch in a phoneme group produces at least one error. + + Raises: + ValueError: If the same phonetic index is mapped to multiple Uthmani indices + (inconsistent mapping) or if an unsupported `correctness_type` is + encountered in a Tajweed rule. + + Examples: + Basic usage with a single word: + >>> moshaf = MoshafAttributes(...) + >>> uth_text = "قَالُوٓا۟" + >>> ref_out = quran_phonetizer(uth_text, moshaf) + >>> pred_text = "كالۥۥ" + >>> errors = explain_error(uth_text, ref_out.phonemes, pred_text, ref_out.mappings) + >>> for err in errors: + ... print(err.error_type, err.speech_error_type, err.expected_ph, err.preditected_ph) + tajweed replace اااااا ۥۥ + tashkeel delete لُ ل + + Example showing a missing Qalqalah: + >>> uth_text = "ٱلْحَقُّ" + >>> ref_out = quran_phonetizer(uth_text, moshaf) + >>> pred_text = "ءَلحقق" + >>> errors = explain_error(uth_text, ref_out.phonemes, pred_text, ref_out.mappings) + >>> for err in errors: + ... if err.error_type == 'tajweed' and err.ref_tajweed_rules: + ... print(err.ref_tajweed_rules[0].name.en) + Qalqalah + + Example with a Madd error (Lazem Madd): + >>> uth_text = "الٓمٓ" + >>> ref_out = quran_phonetizer(uth_text, moshaf) + >>> pred_text = "ءَلِف لَاااااممممِۦۦۦۦۦۦم" + >>> errors = explain_error(uth_text, ref_out.phonemes, pred_text, ref_out.mappings) + >>> for err in errors: + ... if err.ref_tajweed_rules: + ... rule = err.ref_tajweed_rules[0] + ... print(rule.name.en, err.expected_len, err.predicted_len) + Lazem Madd 6 5 + + Notes: + - The alignment is performed on the **first character** of each phoneme group. + This works because the first character is the base consonant or vowel, and the + rest of the group contains diacritics or lengthening marks. 
However, it means + that errors involving only the diacritics of an otherwise correctly pronounced + base will be caught in the "equal" branch (via tashkeel or tajweed checks). + - Inserted groups that have no corresponding reference are given a zero‑width + Uthmani position (start = end) based on the nearest preceding reference group. + This is a heuristic and may be refined in the future. + - The function contains TODOs for improving the precision of Uthmani positions + in insertions and for handling special phonetic marks like Imala and Sakt. + """ ref_ph_groups = chunck_phonemes(ref_ph_text) pred_ph_groups = chunck_phonemes(predicted_ph_text) @@ -130,16 +245,22 @@ def explain_error( ) # Aligning Phonemes groups using first chat of every one - alignmets = align_phonemes_groups(ref_ph_groups, pred_ph_groups) + alignments = align_phonemes_groups(ref_ph_groups, pred_ph_groups) errors = [] pred_ph_start = 0 ref_ph_start = 0 pred_ph_end = 0 ref_ph_end = 0 - for align in alignmets: - ref_ph = ref_ph_groups[align.ref_idx] - pred_ph = pred_ph_groups[align.pred_idx] + for align in alignments: + if align.ref_idx < len(ref_ph_groups): + ref_ph = ref_ph_groups[align.ref_idx] + else: + ref_ph = "" + if align.pred_idx < len(pred_ph_groups): + pred_ph = pred_ph_groups[align.pred_idx] + else: + pred_ph = "" pred_ph_end = pred_ph_start + len(pred_ph) if align.op_type != "insert": @@ -150,6 +271,7 @@ def explain_error( ) ph_pos = (ref_ph_start, ref_ph_end) else: + # TODO: Make the uthmani posision more precise. 
Now we bound it to the aligments uthmani_pos = ( ref_ph_to_uthmani[ref_ph_start], ref_ph_to_uthmani[ref_ph_start], @@ -171,7 +293,7 @@ def explain_error( elif align.op_type == "replace": pred_rules = [] - if ref_ph_groups_tajweed_rules[align.ref_idx] is not None: + if ref_ph_groups_tajweed_rules[align.ref_idx]: for taj_rule in ref_ph_groups_tajweed_rules[align.ref_idx]: pred_taj_rule = taj_rule.get_relvant_rule(pred_ph) if pred_taj_rule is not None: @@ -192,8 +314,8 @@ def explain_error( preditected_ph=pred_ph, expected_len=ref_len, predicted_len=pred_len, - tajweed_rules=[taj_rule], - predicted_tajweed_rules=[pred_taj_rule], + ref_tajweed_rules=[taj_rule], + replaced_tajweed_rules=[pred_taj_rule], ) ) else: @@ -205,7 +327,7 @@ def explain_error( speech_error_type="replace", expected_ph=ref_ph, preditected_ph=pred_ph, - tajweed_rules=[taj_rule], + ref_tajweed_rules=[taj_rule], ) ) @@ -239,16 +361,17 @@ def explain_error( if ref_ph == pred_ph: ... # We have Tajweed rule - elif ref_ph_groups_tajweed_rules[align.ref_idx] is not None: + elif ref_ph_groups_tajweed_rules[align.ref_idx]: for taj_rule in ref_ph_groups_tajweed_rules[align.ref_idx]: exp_len = None pred_len = None + missing_taj_rules = None if taj_rule.correctness_type == "count": pred_len = taj_rule.count(ref_ph, pred_ph) exp_len = taj_rule.golden_len - # TODO: What to do with `match` elif taj_rule.correctness_type == "match": - ... + if not taj_rule.match(ref_ph, pred_ph): + missing_taj_rules = [taj_rule] else: raise ValueError( f"Invalid mathing type: `{taj_rule.correctness_type}`. 
Available: `match`, `count`" @@ -263,14 +386,33 @@ def explain_error( preditected_ph=pred_ph, expected_len=exp_len, predicted_len=pred_len, - tajweed_rules=[taj_rule], + ref_tajweed_rules=[taj_rule], + missing_tajweed_rules=missing_taj_rules, ) ) + if ( + ref_ph[-1] in alph.phonetic_groups.harakat + and pred_ph[-1] != ref_ph[-1] + ): + errors.append( + get_tasshkeel_error( + ref_ph=ref_ph, + pred_ph=pred_ph, + uthmani_pos=uthmani_pos, + ph_pos=ph_pos, + ) + ) # Tashkeel (Harakat) - # TODO: - elif ref_ph_groups[align.ref_idx][-1] in alph.phonetic_groups.harakat: - ... + elif ref_ph[-1] in alph.phonetic_groups.harakat: + errors.append( + get_tasshkeel_error( + ref_ph=ref_ph, + pred_ph=pred_ph, + uthmani_pos=uthmani_pos, + ph_pos=ph_pos, + ) + ) # TODO: imala, sakt, dammao momala elif ref_ph_groups[align.ref_idx][-1] in alph.phonetic_groups.residuals: @@ -283,30 +425,3 @@ def explain_error( ref_ph_start = ref_ph_end return errors - - -if __name__ == "__main__": - uthmani_text = "قالوا" - ph_text = "قاالۥۥ" - predicted_text = "كالۥۥ" - predicted_text = "فكالۥۥ" - predicted_text = "فكۥۥلۥۥ" - - normal_madd_alif = NormalMaddRule(tag="alif") - normal_madd_waw = NormalMaddRule(tag="waw") - - mapping = [ - MappingPos(pos=(0, 1)), - MappingPos(pos=(1, 3), tajweed_rules=[normal_madd_alif]), - MappingPos(pos=(3, 4)), - MappingPos(pos=(4, 6), tajweed_rules=[normal_madd_waw]), - None, - ] - errors = explain_error( - uthmani_text=uthmani_text, - ref_ph_text=ph_text, - predicted_ph_text=predicted_text, - mappings=mapping, - ) - for err in errors: - print(err) diff --git a/src/quran_transcript/phonetics/operations.py b/src/quran_transcript/phonetics/operations.py index fbf5d57..9dc0363 100644 --- a/src/quran_transcript/phonetics/operations.py +++ b/src/quran_transcript/phonetics/operations.py @@ -12,7 +12,18 @@ from .moshaf_attributes import MoshafAttributes from ..alphabet import uthmani as uth from ..alphabet import phonetics as ph -from .tajweed_rulses import NormalMaddRule 
+from .tajweed_rulses import ( + TajweedRule, + Qalqalah, + NormalMaddRule, + MonfaselMaddRule, + MottaselMaddPauseRule, + MottaselMaddRule, + LazemMaddRule, + AaredMaddRule, + LeenMaddRule, + IdghamKamel, +) @dataclass @@ -271,7 +282,7 @@ class BeginWithSaken(ConversionOperation): @dataclass class ConvertAlifMaksora(ConversionOperation): arabic_name: str = "تحويل الأف المقصورة إله: حضف أو ألف أو ياء" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # حذف الأف المقصورة من الاسم المقصور النكرة ( @@ -362,7 +373,7 @@ class RemoveSkoonMostadeer(ConversionOperation): @dataclass class SkoonMostateel(ConversionOperation): arabic_name: str = "ضبط السكون المستطيل" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # remove from the middle ( @@ -381,7 +392,7 @@ class SkoonMostateel(ConversionOperation): @dataclass class MaddAlewad(ConversionOperation): arabic_name: str = "ضبط مد العوض وسطا ووقفا" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # remove from the middle ( @@ -411,7 +422,7 @@ class EnlargeSmallLetters(ConversionOperation): arabic_name: str = ( "تكبير الألف والياء والاو والنون الصغار مع حذف مد الصلة عند الوقف" ) - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # small alif ( @@ -479,7 +490,7 @@ class NormalizeTaa(ConversionOperation): ] ) arabic_name: str = "تحويب التاء المربطة في الوسط لتاء وفي الآخر لهاء" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ (f"{uth.taa_marboota}$", f"{uth.haa}"), (f"{uth.taa_marboota}", f"{uth.taa_mabsoota}"), @@ -514,7 +525,7 @@ class PrepareGhonnaIdghamIqlab(ConversionOperation): ] ) arabic_name: str = "فك 
الإقلاب والعغنة الإدغام" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # النون المقلبة ميمام ( @@ -578,6 +589,7 @@ class PrepareGhonnaIdghamIqlab(ConversionOperation): ( f"([{uth.fatha}{uth.dama}]{uth.yaa}|[{uth.fatha}{uth.kasra}]{uth.waw}|[{uth.pure_letters_without_yaa_and_waw_group}]){uth.space}?([{uth.pure_letters_group}]{uth.shadda})", r"\2", + # IdghamKamel(), ), ] ) @@ -591,7 +603,7 @@ class IltiqaaAlsaknan(ConversionOperation): ] ) arabic_name: str = "التقاء الساكنان وكسر التنوين" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str] | tuple[str, str, TajweedRule]] = field( default_factory=lambda: [ # كسر التنوين ( @@ -769,6 +781,7 @@ def forward( r"\1" + ph.alif * moshaf.madd_monfasel_len + r"\2", text, mappings, + MonfaselMaddRule(golden_len=moshaf.madd_monfasel_len, tag="alif"), ) # normal for k, madd_patt in self.madd_map.items(): @@ -777,6 +790,9 @@ def forward( r"\1" + moshaf.madd_monfasel_len * madd_patt.target + r"\2", text, mappings, + MonfaselMaddRule( + golden_len=moshaf.madd_monfasel_len, tag=madd_patt.name + ), ) # المد المتصل وقفا @@ -790,6 +806,10 @@ def forward( + r"\2", text, mappings, + MottaselMaddPauseRule( + golden_len=max(moshaf.madd_mottasel_waqf, moshaf.madd_aared_len), + tag=madd_patt.name, + ), ) # المد المنفصل @@ -799,6 +819,9 @@ def forward( r"\1" + moshaf.madd_mottasel_len * madd_patt.target + r"\2", text, mappings, + MottaselMaddRule( + golden_len=moshaf.madd_mottasel_len, tag=madd_patt.name + ), ) # المد اللازم @@ -808,10 +831,13 @@ def forward( r"\1" + (moshaf.madd_yaa_alayn_alharfy - 1) * ph.yaa, text, mappings, + LeenMaddRule(golden_len=moshaf.madd_yaa_alayn_alharfy, tag="yaa"), ) # ميم آل عمران + meem_aal_imran_taj_rule = LazemMaddRule(tag="alif") if moshaf.meem_aal_imran == "wasl_2": meema_len = 2 + meem_aal_imran_taj_rule = NormalMaddRule(tag="alif") elif moshaf.meem_aal_imran == "wasl_6": meema_len = 6 else: @@ -821,6 
+847,7 @@ def forward( r"\1" + ph.yaa_madd * meema_len + r"\2", text, mappings, + meem_aal_imran_taj_rule, ) for k, madd_patt in self.madd_map.items(): @@ -829,6 +856,7 @@ def forward( r"\1" + 6 * madd_patt.target + r"\2", text, mappings, + LazemMaddRule(tag=madd_patt.name), ) # المد العارض للسكون @@ -838,6 +866,7 @@ def forward( r"\1" + moshaf.madd_aared_len * madd_patt.target + r"\2", text, mappings, + AaredMaddRule(golden_len=moshaf.madd_aared_len, tag=madd_patt.name), ) # مد اللين @@ -846,6 +875,7 @@ def forward( r"\1" + (moshaf.madd_alleen_len - 1) * r"\2" + r"\3", text, mappings, + LeenMaddRule(golden_len=moshaf.madd_alleen_len), ) # المد الطبيعي @@ -864,9 +894,10 @@ def forward( @dataclass class Qalqla(ConversionOperation): arabic_name: str = "إضافة علامة القلقة" - regs: tuple[str, str] = ( + regs: tuple[str, str, TajweedRule] = ( f"([{uth.qlqla_group}](?:{uth.shadda}$|{uth.ras_haaa}|$))", r"\1" + ph.qlqla, + Qalqalah(), ) ops_before: list[ConversionOperation] = field( default_factory=lambda: [ @@ -878,7 +909,7 @@ class Qalqla(ConversionOperation): @dataclass class RemoveRasHaaAndShadda(ConversionOperation): arabic_name: str = "حذف السكون والشدة م تكرار الحرف المشدد" - regs: list[tuple[str, str]] = field( + regs: list[tuple[str, str, TajweedRule] | tuple[str, str]] = field( default_factory=lambda: [ # shadda ( diff --git a/src/quran_transcript/phonetics/tajweed_rulses.py b/src/quran_transcript/phonetics/tajweed_rulses.py index 4b36822..d3f3026 100644 --- a/src/quran_transcript/phonetics/tajweed_rulses.py +++ b/src/quran_transcript/phonetics/tajweed_rulses.py @@ -54,13 +54,21 @@ class Qalqalah(TajweedRule): golden_len: int = 0 correctness_type: Literal["match", "count"] = "match" + def match(self, ref_text, pred_text) -> bool: + return ref_text == pred_text + def is_ph_str_in(self, ph_str: str) -> bool: """Whether the phonetic script is assoicated with this Tajweed rule or not""" return True def get_relvant_rule(self, ph_str: str) -> Optional["TajweedRule"]: 
"""Returs a Tajweed rule that is assocaited with the input ph_str""" - return self + if not ph_str: + return None + elif ph_str[-1] == alph.phonetics.qlqla: + return self + else: + return None @dataclass @@ -88,14 +96,14 @@ def count(self, ref_text, pred_text) -> int: def is_ph_str_in(self, ph_str: str) -> bool: """Whether the phonetic script is assoicated with this Tajweed rule or not""" if ph_str: - return ph_str[0] in self._madd_to_tags + return ph_str[0] in self._madd_to_tag else: return False def get_relvant_rule(self, ph_str: str) -> Optional["TajweedRule"]: """Returs a Tajweed rule that is assocaited with the input ph_str""" if not ph_str: - raise ValueError("Empty String") + return None elif ph_str[0] not in self._madd_to_tag: return None return replace(self, tag=self._madd_to_tag[ph_str[0]]) @@ -109,6 +117,186 @@ class NormalMaddRule(MaddRule): golden_len: int = 2 +@dataclass +class MonfaselMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="المد المنفصل", en="Monfasel Madd") + ) + golden_len: int = 4 + + +@dataclass +class MottaselMaddPauseRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName( + ar="المد المتصل وقفا", en="Mottasel Madd at Pause" + ) + ) + golden_len: int = 4 + + +@dataclass +class MottaselMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="المد المتصل", en="Mottasel Madd") + ) + golden_len: int = 4 + + +@dataclass +class LazemMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="المد اللازم", en="Lazem Madd") + ) + golden_len: int = 6 + + +@dataclass +class AaredMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="المد العارض للسكون", en="Aared Madd") + ) + golden_len: int = 4 + + +@dataclass +class LeenMaddRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName(ar="مد اللين", en="Leen Madd") + ) + golden_len: int = 4 + + def __post_init__(self): + self.available_tags 
= {"waw", "yaa"} + super().__post_init__() + self._madd_to_tag = { + alph.phonetics.waw: "waw", + alph.phonetics.yaa: "yaa", + } + + def count(self, ref_text, pred_text) -> int: + # The case where we have Tashkeel after madd (Error from the model) + if pred_text[-1] != pred_text[0]: + return pred_text[:-1].count(ref_text[0]) + 1 + else: + return pred_text.count(ref_text[0]) + 1 + + +@dataclass +class IdghamKamel(TajweedRule): + name: LangName = field( + default_factory=lambda: LangName(ar="إدغام كامل", en="Full Merging") + ) + golden_len: int = 0 + correctness_type: Literal["match", "count"] = "match" + + def match(self, ref_text, pred_text) -> bool: + return ref_text == pred_text + + def is_ph_str_in(self, ph_str: str) -> bool: + """Whether the phonetic script is assoicated with this Tajweed rule or not""" + return True + + def get_relvant_rule(self, ph_str: str) -> Optional["TajweedRule"]: + """Returs a Tajweed rule that is assocaited with the input ph_str""" + return None + + +@dataclass +class GhonnahMetadata: + name: LangName + tag: str + offset: int = 0 + + +@dataclass +class Ghonnah(TajweedRule): + name: LangName + golden_len: int = 4 + correctness_type: Literal["match", "count"] = "count" + offset: int = 0 + + def __post_init__(self): + self.available_tags = { + "noon", + "noon_yaa", + "noon_waw", + "noon_mokhfah", + "meem", + "meem_mokhfah", + } + super().__post_init__() + self._ph_to_metadata = { + alph.phonetics.noon: GhonnahMetadata( + name=field( + default_factory=lambda: LangName( + ar="النون المشددة أو المدغمة", en="Moshadad or Modgham Noon" + ) + ), + tag="noon", + offset=0, + ), + alph.phonetics.yaa: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="noon_yaa", + offset=1, + ), + alph.phonetics.waw: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="noon_waw", + offset=1, + ), + alph.phonetics.noon_mokhfah: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", 
en="")), + tag="noon_mokhfah", + offset=1, + ), + alph.phonetics.meem: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="meem", + offset=0, + ), + alph.phonetics.meem_mokhfah: GhonnahMetadata( + name=field(default_factory=lambda: LangName(ar="", en="")), + tag="meem_mokhfah", + offset=0, + ), + } + + def count(self, ref_text, pred_text) -> int: + return pred_text.count(ref_text[0]) + self.offset + + def is_ph_str_in(self, ph_str: str) -> bool: + """Whether the phonetic script is assoicated with this Tajweed rule or not""" + if ph_str: + return ph_str[0] in self._ph_to_metadata + else: + return False + + def get_relvant_rule(self, ph_str: str) -> Optional["TajweedRule"]: + """Returs a Tajweed rule that is assocaited with the input ph_str""" + if not ph_str: + return None + elif ph_str[0] not in self._ph_to_metadata: + return None + return replace( + self, + name=self._ph_to_metadata[ph_str[0]].name, + offset=self._ph_to_metadata[ph_str[0]].offset, + tag=self._ph_to_metadata[ph_str[0]].tag, + ) + + +@dataclass +class MoshaddadOrModghamNoonRule(MaddRule): + name: LangName = field( + default_factory=lambda: LangName( + ar="النون المشددة أو المدغمة", en="Moshaddad or ModghamNoon" + ) + ) + golden_len: int = 4 + + # TODO: """ diff --git a/tests/test_explain_error_api.py b/tests/test_explain_error_api.py index 1992555..d2345a7 100644 --- a/tests/test_explain_error_api.py +++ b/tests/test_explain_error_api.py @@ -1,11 +1,12 @@ -from quran_transcript.phonetics.error_explainer import ReciterError, explain_error - -from quran_transcript import quran_phonetizer, MoshafAttributes +from quran_transcript import ( + quran_phonetizer, + MoshafAttributes, + ReciterError, + explain_error, +) if __name__ == "__main__": - uthmani_text = "قَالُوٓا۟" - moshaf = MoshafAttributes( rewaya="hafs", madd_monfasel_len=4, @@ -13,12 +14,24 @@ madd_mottasel_waqf=4, madd_aared_len=4, ) - ref_ph_out = quran_phonetizer(uthmani_text, moshaf) + uthmani_text = 
"قَالُوٓا۟" predicted_text = "كالۥۥ" predicted_text = "فكالۥۥ" predicted_text = "فكۥۥلۥۥ" + # uthmani_text = "ٱلْحَقُّ" + # predicted_text = "ءَلحَقق" + # predicted_text = "ءَلحقق" + # predicted_text = "ءَلحُقق" + + # uthmani_text = "الٓمٓ" + # predicted_text = "ءَلِف لَااااااممممِۦۦۦۦۦۦم" + # predicted_text = "ءَلِف لَاااااممممِۦۦۦۦۦۦم" + + ref_ph_out = quran_phonetizer(uthmani_text, moshaf) + print(ref_ph_out.phonemes) + print(predicted_text) errors = explain_error( uthmani_text=uthmani_text, ref_ph_text=ref_ph_out.phonemes, @@ -26,4 +39,11 @@ mappings=ref_ph_out.mappings, ) for err in errors: + print( + f"UTH: `{uthmani_text[err.uthmani_pos[0] : err.uthmani_pos[1]]}`, {err.uthmani_pos}" + ) + print( + f"PH: `{ref_ph_out.phonemes[err.ph_pos[0] : err.ph_pos[1]]}`, {err.ph_pos}" + ) print(err) + print("-" * 50) diff --git a/tests/test_sub_with_mapping.py b/tests/test_sub_with_mapping.py index 5b35970..202e1aa 100644 --- a/tests/test_sub_with_mapping.py +++ b/tests/test_sub_with_mapping.py @@ -13,16 +13,18 @@ ) aya = Aya() aya = Aya(1, 1) - # aya = Aya(12, 1) - aya = Aya(2, 1) - aya = Aya(19, 1) + aya = Aya(12, 1) + # aya = Aya(2, 1) + # aya = Aya(19, 1) # aya = Aya(75, 27) - aya = Aya(2, 6) - aya = Aya(2, 7) - aya = Aya(27, 62) + # aya = Aya(2, 6) + # aya = Aya(2, 7) + # aya = Aya(27, 62) + # aya = Aya(112, 3) # aya = Aya(3, 1) # aya = Aya(30, 28) # aya = Aya(2, 9) + # aya = Aya(106, 1) uth_text = aya.get().uthmani # uth_text = aya.get_by_imlaey_words(start=7, window=2).uthmani @@ -30,15 +32,18 @@ # uth_text = "غِشَـٰوَةٌۭ وَلَهُمْ" # uth_text = "قَلِيلًۭا مِّمَّا" # uth_text = "أَمَّن يُجِيبُ" + # uth_text = "قَرِيبٌ" + # uth_text = "ٱلْحَقُّ" profiler = Profiler() profiler.start() - ph_out = quran_phonetizer(uth_text, moshaf, remove_spaces=True) + ph_out = quran_phonetizer(uth_text, moshaf) profiler.stop() ph_text = ph_out.phonemes print(uth_text) print(ph_out.phonemes) print(ph_out.mappings) + print("*" * 40) for idx, uth_c in enumerate(uth_text): 
print(f"UTH_IDX: `{idx}`, SPAN: `{ph_out.mappings[idx]}`") ph_c = "" diff --git a/tests/test_sub_with_mapping_pytest.py b/tests/test_sub_with_mapping_pytest.py index 1a9e903..c36a83b 100644 --- a/tests/test_sub_with_mapping_pytest.py +++ b/tests/test_sub_with_mapping_pytest.py @@ -17,7 +17,14 @@ clean_uthmani_spaces, ) -from quran_transcript.phonetics.tajweed_rulses import NormalMaddRule, Qalqalah +from quran_transcript.phonetics.tajweed_rulses import ( + NormalMaddRule, + Qalqalah, + LeenMaddRule, + AaredMaddRule, + LazemMaddRule, + MottaselMaddRule, +) # Import the sub_with_mapping function from the existing test file @@ -639,7 +646,14 @@ def test_merge_mappings_complex_range(self): "ءَلِف لَاااااام رَاا تِلكَ ءَاايَااتُ لكِتَاابِ لمُبِۦۦۦۦن", [ MappingPos(pos=(0, 6), tajweed_rules=None), - MappingPos(pos=(6, 16), tajweed_rules=None), + MappingPos( + pos=(6, 16), + tajweed_rules=[ + LazemMaddRule( + tag="alif", + ) + ], + ), MappingPos(pos=(16, 16), deleted=True), MappingPos( pos=(16, 20), @@ -707,7 +721,15 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(50, 51), tajweed_rules=None), MappingPos(pos=(51, 52), tajweed_rules=None), MappingPos(pos=(52, 53), tajweed_rules=None), - MappingPos(pos=(53, 57), tajweed_rules=None), + MappingPos( + pos=(53, 57), + tajweed_rules=[ + AaredMaddRule( + golden_len=4, + tag="yaa", + ) + ], + ), MappingPos(pos=(57, 58), tajweed_rules=None), MappingPos(pos=(58, 58), deleted=True), ], @@ -752,8 +774,16 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(27, 28), tajweed_rules=None), MappingPos(pos=(28, 29), tajweed_rules=None), MappingPos(pos=(29, 30), tajweed_rules=None), - MappingPos(pos=(30, 34), tajweed_rules=None), - MappingPos(pos=(34, 35), tajweed_rules=None), + MappingPos( + pos=(30, 34), + tajweed_rules=[ + AaredMaddRule( + golden_len=4, + tag="yaa", + ) + ], + ), + MappingPos(pos=(34, 35), deleted=False), MappingPos(pos=(35, 35), deleted=True), ], ), @@ -805,7 +835,10 @@ def 
test_merge_mappings_complex_range(self): MappingPos(pos=(28, 29), tajweed_rules=None), MappingPos(pos=(29, 30), tajweed_rules=None), MappingPos(pos=(30, 31), tajweed_rules=None), - MappingPos(pos=(31, 35), tajweed_rules=None), + MappingPos( + pos=(31, 35), + tajweed_rules=[MottaselMaddRule(golden_len=4, tag="alif")], + ), MappingPos(pos=(35, 35), deleted=True), MappingPos(pos=(35, 36), tajweed_rules=None), MappingPos(pos=(36, 38), tajweed_rules=None), @@ -878,7 +911,15 @@ def test_merge_mappings_complex_range(self): MappingPos(pos=(93, 94), tajweed_rules=None), MappingPos(pos=(94, 95), tajweed_rules=None), MappingPos(pos=(95, 96), tajweed_rules=None), - MappingPos(pos=(96, 100), tajweed_rules=None), + MappingPos( + pos=(96, 100), + tajweed_rules=[ + AaredMaddRule( + golden_len=4, + tag="waw", + ) + ], + ), MappingPos(pos=(100, 101), tajweed_rules=None), MappingPos(pos=(101, 101), deleted=True), ], @@ -919,9 +960,16 @@ def test_merge_mappings_complex_range(self): "ءَلِف لَااااااممممِۦۦۦۦۦۦم", [ MappingPos(pos=(0, 6), tajweed_rules=None, deleted=False), - MappingPos(pos=(6, 14), tajweed_rules=None, deleted=False), + MappingPos( + pos=(6, 14), + tajweed_rules=[LazemMaddRule(tag="alif")], + deleted=False, + ), MappingPos(pos=(14, 14), tajweed_rules=None, deleted=True), - MappingPos(pos=(14, 26), tajweed_rules=None, deleted=False), + MappingPos( + pos=(14, 26), + tajweed_rules=[LazemMaddRule(tag="yaa")], + ), MappingPos(pos=(26, 26), tajweed_rules=None, deleted=True), ], ), @@ -994,6 +1042,90 @@ def test_merge_mappings_complex_range(self): ), ], ), + ( + "لَمْ يَلِدْ وَلَمْ يُولَدْ", + "لَم يَلِدڇ وَلَم يُۥۥلَدڇ", + [ + MappingPos(pos=(0, 1), tajweed_rules=None, deleted=False), + MappingPos(pos=(1, 2), tajweed_rules=None, deleted=False), + MappingPos(pos=(2, 3), tajweed_rules=None, deleted=False), + MappingPos(pos=(3, 3), tajweed_rules=None, deleted=True), + MappingPos(pos=(3, 4), tajweed_rules=None, deleted=False), + MappingPos(pos=(4, 5), tajweed_rules=None, 
deleted=False), + MappingPos(pos=(5, 6), tajweed_rules=None, deleted=False), + MappingPos(pos=(6, 7), tajweed_rules=None, deleted=False), + MappingPos(pos=(7, 8), tajweed_rules=None, deleted=False), + MappingPos(pos=(8, 10), tajweed_rules=[Qalqalah()]), + MappingPos(pos=(10, 10), tajweed_rules=None, deleted=True), + MappingPos(pos=(10, 11), tajweed_rules=None, deleted=False), + MappingPos(pos=(11, 12), tajweed_rules=None, deleted=False), + MappingPos(pos=(12, 13), tajweed_rules=None, deleted=False), + MappingPos(pos=(13, 14), tajweed_rules=None, deleted=False), + MappingPos(pos=(14, 15), tajweed_rules=None, deleted=False), + MappingPos(pos=(15, 16), tajweed_rules=None, deleted=False), + MappingPos(pos=(16, 16), tajweed_rules=None, deleted=True), + MappingPos(pos=(16, 17), tajweed_rules=None, deleted=False), + MappingPos(pos=(17, 18), tajweed_rules=None, deleted=False), + MappingPos(pos=(18, 19), tajweed_rules=None, deleted=False), + MappingPos( + pos=(19, 21), + tajweed_rules=[ + NormalMaddRule( + tag="waw", + ) + ], + ), + MappingPos(pos=(21, 22), tajweed_rules=None, deleted=False), + MappingPos(pos=(22, 23), tajweed_rules=None, deleted=False), + MappingPos( + pos=(23, 25), + tajweed_rules=[Qalqalah()], + ), + MappingPos(pos=(25, 25), tajweed_rules=None, deleted=True), + ], + ), + ( + "قَرِيبٌ", + "قَرِۦۦۦۦبڇ", + [ + MappingPos(pos=(0, 1), tajweed_rules=None, deleted=False), + MappingPos(pos=(1, 2), tajweed_rules=None, deleted=False), + MappingPos(pos=(2, 3), tajweed_rules=None, deleted=False), + MappingPos(pos=(3, 4), tajweed_rules=None, deleted=False), + MappingPos(pos=(4, 8), tajweed_rules=[AaredMaddRule(tag="yaa")]), + MappingPos(pos=(8, 10), tajweed_rules=[Qalqalah()]), + MappingPos(pos=(10, 10), tajweed_rules=None, deleted=True), + ], + ), + ( + "لِإِيلَـٰفِ قُرَيْشٍ", + "لِءِۦۦلَاافِ قُرَيييش", + [ + MappingPos(pos=(0, 1), tajweed_rules=None, deleted=False), + MappingPos(pos=(1, 2), tajweed_rules=None, deleted=False), + MappingPos(pos=(2, 3), 
tajweed_rules=None, deleted=False), + MappingPos(pos=(3, 4), tajweed_rules=None, deleted=False), + MappingPos(pos=(4, 6), tajweed_rules=[NormalMaddRule(tag="yaa")]), + MappingPos(pos=(6, 7), tajweed_rules=None, deleted=False), + MappingPos(pos=(7, 8), tajweed_rules=None, deleted=False), + MappingPos(pos=(8, 8), tajweed_rules=None, deleted=True), + MappingPos(pos=(8, 10), tajweed_rules=[NormalMaddRule(tag="alif")]), + MappingPos(pos=(10, 11), tajweed_rules=None, deleted=False), + MappingPos(pos=(11, 12), tajweed_rules=None, deleted=False), + MappingPos(pos=(12, 13), tajweed_rules=None, deleted=False), + MappingPos(pos=(13, 14), tajweed_rules=None, deleted=False), + MappingPos(pos=(14, 15), tajweed_rules=None, deleted=False), + MappingPos(pos=(15, 16), tajweed_rules=None, deleted=False), + MappingPos(pos=(16, 17), tajweed_rules=None, deleted=False), + MappingPos( + pos=(17, 20), + tajweed_rules=[LeenMaddRule(golden_len=4, tag="yaa")], + ), + MappingPos(pos=(20, 20), tajweed_rules=None, deleted=True), + MappingPos(pos=(20, 21), tajweed_rules=None, deleted=False), + MappingPos(pos=(21, 21), tajweed_rules=None, deleted=True), + ], + ), ], ) def test_phonetizer_with_mappings( diff --git a/uv.lock b/uv.lock index 67decf0..f7d108f 100644 --- a/uv.lock +++ b/uv.lock @@ -557,7 +557,7 @@ wheels = [ [[package]] name = "quran-transcript" -version = "0.4.0" +version = "0.5.1" source = { editable = "." } dependencies = [ { name = "fuzzysearch" },