From 03b632ca5835e2d564e24875bee5be45489fb886 Mon Sep 17 00:00:00 2001 From: Guacanole01 <98541203+Guacanole01@users.noreply.github.com> Date: Tue, 11 Mar 2025 03:45:48 +0000 Subject: [PATCH] Complete files and tokenization hw --- lab.ipynb | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 8 deletions(-) diff --git a/lab.ipynb b/lab.ipynb index 56262e7..09f85da 100644 --- a/lab.ipynb +++ b/lab.ipynb @@ -14,21 +14,116 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total tokens: 8045\n", + "unique types: 2007\n" + ] } - }, - "outputs": [], + ], "source": [ - "# your code here" + "from pathlib import Path\n", + "import re \n", + "\n", + "with open(\"book_9.txt\", \"r\") as file:\n", + " text = file.read()\n", + "\n", + "tokens = text.split()\n", + "\n", + "def refine_tokenization(tokens):\n", + " refined_token = []\n", + " for token in tokens:\n", + " refined_token.extend(re.findall(r\"\\w+|\\W+\", token))\n", + " return refined_token\n", + "\n", + "refined_tokens = refine_tokenization(tokens)\n", + "\n", + "refinded_tokens = [token.lower() for token in refined_tokens if token.strip()]\n", + "\n", + "total_tokens = len(refined_tokens)\n", + "\n", + "unique_types = len(set(refined_tokens))\n", + "print(f\"total tokens: {total_tokens}\")\n", + "print(f\"unique types: {unique_types}\")\n", + "\n", + "\n", + "# out_dir = Path(\"book_9.txt\")\n", + "\n", + "# outfile = out_dir / Path(\"all_lines.txt\")\n", + "\n", + "# if not out_dir.exists():\n", + "# out_dir.mkdir()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Amount of Tokens: 6486\n", + "Amount of Token Types: 2046\n", + "Types of Tokens: {'rise', 'horrid', 'Juno', 
'Rolled', 'public', 'her', 'moved', 'pour', 'leads', 'thought', 'felt', 'promised', 'warn', 'Pluto', 'line', 'deeps', 'his', 'Nestor', 'Or', 'waiting', 'bought', 'fit', 'armies', 'bestowed', 'power', 'gifts', 'Has', 'Venus', 'hands', 'possession', 'inglorious', \"time—'twas\", 'mix', 'repose', 'unmoved', 'Him', \"brother's\", 'figure', 'stayed', 'life', 'lofty', 'example', 'left', 'stretched', 'sleepless', 'verge', 'end', 'scorn', 'arms', 'sat', 'Full', 'stratagem', 'Unseen', 'accomplish', 'So', 'march', 'proffered', 'soothes', 'repentance', 'cannot', 'ah', 'vain', 'flamed', 'acclamations', 'once', 'hence', 'reversed', 'Thetis', 'thus', 'anger', 'your', 'contains', 'lovely', 'prowess', 'Cursed', 'means', 'wrath', 'interval', 'thrown', 'implore', 'learn', 'behind', 'ships', 'deserve', 'gentler', 'You', 'both', 'barren', 'Life', \"Tydeus'\", 'crime', 'withstood', 'majestic', 'Prayers', 'smoking', 'ask', 'starting', 'joyful', 'approve', 'sit', 'canisters', 'Delightful', 'gain', 'sprung', 'lost', 'troops', 'voyage', 'Fear', 'thirst', 'fears', 'dared', 'against', 'defame', 'porches', 'decree', 'herds', 'erect', 'resigned', 'entire', 'these', 'monstrous', 'sink', 'frozen', 'hear', 'skill', 'careful', \"world's\", 'found', 'due', 'amongst', 'long', 'leaders', 'Lest', 'Its', 'commends', 'moves', 'My', 'wars—', 'appears', 'meditates', 'Hailed', 'reverend', 'chose', 'blame', 'my', 'sails', 'quit', \"'\", 'Althæa', 'curses', 'Thee', 'All', 'Silent', 'Briseïs', 'transport', 'tyrant', 'admired', 'unmatched', 'grasped', 'showed', 'breathed', 'fixed', 'complain', 'noble', 'plenty', 'slaughter', 'birth', 'haughty', 'wisest', 'tides', 'brother', 'bay', 'Trust', 'yield', 'expect', 'wall', 'Heaps', 'draws', 'glory', 'punish', 'frantic', 'exceed', 'vengeance', '\"', 'father', 'have', 'fat', 'sacrifice', 'Return', 'grief', 'mutual', 'prayers', 'dark', 'great—', 'impatient', 'One', 'dungeon', 'submissions', 'vex', 'sustains', 'slow', 'days', 'me', 'wings', 'unworthy', 'seven', 
'goddesses', 'Each', 'uncle', 'advised', 'impious', 'sent—', 'Those', 'band', 'past', 'fired', 'Theirs', 'fragments', 'bursting', 'indulging', 'Distaste', 'supplicating', 'lords', 'He', 'asks', 'peers', 'believed', 'run', 'love', 'private', 'repasts', 'boasted', 'Than', 'at', 'assembled', 'fields', 'Blessed', 'seed', 'gate', 'God', 'Ranged', 'reason', 'proclaim', 'manners', 'delights', 'blood', 'bend', 'renown', 'field', 'lives', 'charms', 'Pythian', 'Phœnix', 'carpets', 'plead', 'nearest', 'self', 'long-extended', 'cars', 'purest', 'gold', 'brave', 'convey', 'peaceful', 'accents', 'one', 'reply', 'came', 'Hira', 'disdains', 'before', 'forces', 'waste', 'Hear', 'Applauding', 'acres', \"Phœnix'\", 'daughter', '[7]', 'goddess-born', 'reconciling', 'distinction', 'debate', 'undaunted', 'draughts', 'sues', 'growing', 'Down', 'shores', 'thankless', 'blaze', 'warrior-god', 'within', 'proffers', 'wins', 'believe', 'a', 'cleansing', 'urns', 'inner', 'word', 'dishonoured', 'gave', 'subside', 'round', \"Meleager's\", 'fenced', 'proceeds', 'feel', 'For', 'people', 'caressed', 'were', 'death', 'black', 'sweats', 'paid', 'help', 'obeyed', 'Fellows', 'consort', 'cruel', 'direful', 'confirmed', 'swelling', 'various', 'renowned', 'necessity', 'harboured', 'Lesbos', \"daughter's\", 'harmonious', 'atonement', 'danger', 'That', 'beneath', 'vows', 'stood', 'grace', 'Agamemnon', 'joined', 'man', 'cursed', 'To', 'banquets', 'few', 'purple', 'couch', 'Myself', 'aid', 'rams', 'just', 'Automedon', 'witness', 'restore', 'Go', 'desert', 'attends', 'threw', 'gloomy', 'grieved', 'brink', 'beast', 'other', 'nod', 'Offending', 'let', 'guiltless', 'Grecian', 'Proud', 'return', '[13]', 'Due', 'draw', 'make', 'send', 'ill', 'Shouts', 'In', 'of', 'saddened', 'sober', 'accent', 'Tell', 'might', 'redeem', 'Æpea', 'mien', 'Ah', 'relentless', 'void', 'ruthless', 'air', 'beholds', 'ceased', 'wilt', 'treasures', 'spread', 'Cardamylé', 'poor', 'begin', 'detests', 'oh', 'army', 'wines', 'some', 'kings', 
'abodes', 'flight', 'haughtiest', 'Returns', 'live', '\"Divine', 'allay', 'poniard', 'How', 'tempest', 'divine', 'farther', 'Oh', 'matchless', 'divides)', 'Heavens', 'flame', 'given', \"o'er\", 'because', 'hateful', 'Pedasus', '\"Health', 'Durst', 'Pure', 'parting', 'Till', 'vilest', 'horsemen', 'again', 'mournful', 'an', 'Untouched', 'brazen', 'transfixes', 'constant', 'slowly', 'north', 'resumed', 'mistake', 'sounding', 'another', 'them', 'who', 'virtues', 'none', \"hell's\", 'guarded', 'prepared', 'strews', 'heads', \"Hector's\", 'retain', 'kind', 'unresisted', 'enmities', 'design', 'spacious', 'law', 'powers', 'not—a', \"navy's\", 'arm', 'toil', 'canvas', 'hardly', 'monster', 'attend', 'yields', 'better', 'feeble', '[9]', 'Grecians', 'confines', 'warm', 'owns', 'tall', 'Antheia', 'followed', 'What', 'first', 'Gifts', 'sides', 'noblest', 'worthy', 'Repel', 'watch', 'speak', 'mourned', 'whispers', 'Give', 'years', 'proud', 'heaven', 'fury', 'shew', 'Learn', 'doubtful', 'passions', '\"Ulysses', 'head', 'deprecate', 'old', 'reached', 'sire', 'Thou', 'sue', 'retire', 'fought', 'heaps', 'sends', 'gods', 'impudence', 'fled', 'fountains', 'golden', 'Fain', 'Scæan', 'Like', 'many', 'behest', 'king', 'stand', 'goblet', 'too', 'mighty', 'vengeful', 'alone', 'pasture', 'nine', 'relate', 'share', 'vital', 'spirit', 'liquid', 'plains', 'halves', 'blue', 'times', 'deaths', 'shouts', 'canst', 'cries', 'sword', 'confess', 'utter', 'infant', 'embrace', 'pleasure', 'Œneus', 'flesh', 'green', 'meaner', 'bear', 'Troy', 'must', 'steeled', 'bids', 'living', 'began', 'heralds', 'course', 'happy', 'nations', 'revered', 'Are', 'battles', 'Cleopatra', 'mine', 'sake', 'two', 'unravished', 'mankind', 'on', 'infants', 'thousand', 'form', 'divides', 'shewn', 'sins', 'friendship', 'gates', \"Althæa's\", 'day', 'wouldst', 'labours', '\"When', 'tusks', 'sinking', 'broke', 'by', 'broken', 'fifty', 'hour', 'hast', 'alike', 'main', 'ascend', 'bulls', 'thought—but', 'Of', 'we', 'but', 'shake', 
'Patroclus', 'foaming', 'hate', 'ever-pleasing', 'martial', 'bred', 'warmer', 'less', 'favoured', 'rule', 'heaped', 'justice', 'conveyed', 'sunk', 'Thus', 'Let', 'Just', 'lord', 'erred', 'honoured', 'guards', 'Inspires', 'vines', 'portion', 'piles', 'wrinkled', 'bestows', 'Alike', 'eyes', 'fleet', 'native', 'many-peopled', 'thither', 'tongue', 'His', 'Content', 'chance', \"stranger's\", 'swear', 'loaded', 'fought—', 'She', 'hears', 'Accept', 'horrors', 'But', 'Through', 'streams', 'Some', 'boar', 'give', 'harvests', \"daughters'\", 'cheats', 'millions', 'fair', 'follow', 'heavy', 'oars', '[8]', 'spoil', 'fortify', 'entreat', 'sage', 'prey', 'Words', 'own', 'Infernal', 'vessels', 'speed', 'ear', 'worth', \"'My\", 'Peleus', 'drowned', 'immense', 'spoke', 'dissuade', 'calm', 'Ships', 'contention', 'asserts', 'shades', 'choose', 'presides', 'Hell', 'scene', 'deliverer', 'scenes', 'angry', 'Meantime', 'Convinced', 'liked', 'Pleased', 'heaven-protected', 'betimes', 'ten', 'distract', 'extent', 'half', 'not', 'terms', 'wrapped', 'full', \"Pthia's\", 'ruddy', 'call', 'deeds', 'food', 'still', 'belong', 'paints', 'youthful', 'sea', 'seas', 'Beneath', 'vow', 'Slave', 'favouring', 'haste', 'loads', 'subdue', 'morning', 'Besought', 'discharged', 'long-contested', 'close', 'renounce', 'tossed', 'brass', 'sands', 'Contention', 'implored', 'To-morrow', 'rich', 'durst', \"prince's\", 'all', 'Deïpyrus', 'prepares', 'did', 'move', 'runs', 'approves', 'glows', 'was', 'thing', 'roofs', 'its', 'verdant', 'shore', 'sky', 'wake', 'social', 'Rich', 'you', 'take', 'brand', \"what's\", 'refund', 'faithless', 'Ulysses', 'plunge', 'returned', 'murder', 'dares', 'guests', 'bribe', \"soul's\", 'judgment', 'style', 'endowments', 'Strong', 'dame', 'fierce', 'foes', 'Gods', 'trench', 'beats', 'hope', 'On', 'luckless', 'sighs', 'murmuring', 'Should', 'swine', 'ravished', 'ordered', 'ours', 'homicide', 'These', 'Ascalaphus', 'chines', 'humble', 'coast', 'Lives', 'orders', 'eye', 'decline', 'iron', 
'yonder', 'forgets', 'employ', 'camps', 'bade', 'pass', 'chiefs', 'sound', 'wine', 'heaven-bred', 'fathers', 'embattled', 'heroes', 'free', 'eased', 'stores', 'Superior', 'though', 'comfort', 'mourn', 'councils', 'An', 'Mix', 'lightning', 'hair', 'comrade', 'unconfined', 'fortunes', 'not——but', 'Hellespont', 'restored', 'suppressed—', 'neighbour', 'vied', 'loves', 'winged', 'well-wrought', 'theirs', 'no', 'fruitful', 'near', 'frame', 'bounties', 'tore', 'drew', 'talents', 'deride', \"Agamemnon's\", 'flow', 'beard', 'mortal', 'faults', 'conspire', 'linen', 'shared', 'saved', 'Yet', 'changed', 'briny', 'it', 'Permit', 'A', 'partake', 'greedy', 'nobler', 'Thracian', 'Fate', 'Liked', 'As', 'wives', 'single', 'office', 'owned', 'Convened', 'Two', 'suffering', 'Rest', 'unconquered', 'burst', 'resentment', 'brought', 'Stand', 'most', 'nor', 'source', 'profound', 'spouse', 'fire', 'suit', 'frenzy', 'Lo', 'compassion', 'strength', 'forth', 'engage', 'conduct', 'fates', 'flames', 'daily', 'wide', 'soft', 'Injustice', 'view', 'remain', 'performed', 'remains', 'rejects', 'empress', 'pours', 'mercy', 'Bid', 'detested', 'silence', 'shade', 'to', 'dust', 'tear', 'drenched', 'avenging', 'rocky', 'if', 'commerce', 'regain', 'skies', 'When', 'tenth', 'surprise', 'glories', 'courage', 'ever', 'removes', 'Us', 'turns', 'Trojan', 'maid', 'commands', 'dire', 'care', 'Myrmidonian', \"wife's\", 'mixed', 'led', 'needs', 'greet', 'triumphs', '\"Well', 'only', 'mountains', 'bowls', 'stern', 'consists', 'descending', 'headlong', 'since', 'sanctify', 'Against', 'present', 'heifers', 'attract', 'wars', 'bold', 'Unfit', 'afflicted', 'common', '[6]', 'Welcome', 'hosts', 'friendly', '[12]', 'act', 'roof', 'sheep', 'Priests', 'lips', 'prevent', 'monarch', 'Achilles', 'Would', 'board', 'Ætolians', '(Rich', 'sweat', '\"Why', 'They', 'whom', 'kept', 'men', 'Aphareus', 'digest', 'thunder', 'richest', 'always', 'If', \"hero's\", 'travels', 'While', 'shameless', 'Enjoy', 'deathless', 'steel', 'short', 
'Pthia', 'purchased', 'Their', 'damsel', 'bowl', 'great', 'rapine', 'dawning', 'commissioned', 'thunders', 'Fair', 'Skilled', 'force', 'know', 'with', 'stands', 'want', 'seat', 'ramparts', 'rest', 'disdained', 'bed', 'feasts', 'hero', 'tears', 'Then', 'glide', 'pensive', 'flowing', 'helpless', 'soul', 'Thy', 'beauteous', '[1]', 'bribes', 'late', 'success', 'Pronounce', 'double', 'costly', 'numerous', 'madness', 'Merion', 'choice', \"Jove's\", 'embassy', 'conquered', 'where', 'refresh', 'part', 'fleetness', 'ignobly', 'valued', 'scaled', 'tell', 'ire', 'issuing', 'country', 'fond', 'rose', 'hell', 'state', 'aside', 'price', 'wise', 'when', 'toils', 'young', 'affords', 'heaven-defended', 'Curetes', 'Cynthia', 'opposed', 'unrivalled', 'dies', 'Deceived', 'water', 'enjoy', \"Priam's\", 'brawny', 'empires', 'then', 'best', 'dye', '\"My', 'Ætolia', 'dismay', 'way', 'Wise', 'shining', 'has', 'bird', 'weighty', 'messenger', 'Whose', 'unkind', 'larger', 'Do', 'overthrown', 'Fight', \"'tis\", 'waits', \"Pylos'\", 'interests', 'be', 'ye', 'listening', 'song', 'Which', 'hard', 'Much', 'slaves', 'above', 'extend', 'regal', 'genial', 'hereditary', '(when', 'wrongs', \"I'll\", 'uninjured', 'brightened', 'town', 'world', 'combine', 'repast', 'turn', 'wait', 'property', 'chosen', 'face', 'slept', 'battle', 'deed', 'Proserpine', 'twice', 'Besides', 'Beloved', 'Lyrnessian', 'below', \"Orestes'\", 'duty', 'libation', 'determines', 'laid', 'immortals', 'guide', 'margin', 'steeds', 'go—why', 'destroy', 'strengthened', 'apply', \"rock's\", 'alliance', 'strong', 'daughters', 'lessons', 'slave', 'conquer', 'lay', 'each', '\"Great', 'regretted', 'west', 'sleep', 'heavenly', '\"Oh', 'reward', 'see', 'detain', 'fiercer', 'alternate', 'greater', 'silent', 'sacred', 'pair', '\"O', 'strain', 'report', 'pale', 'oracles', 'opposite', 'blessings', 'replied', 'should', 'bands', 'hundred', \"o'ershades\", 'sandy', 'him', 'never', 'thou', 'deep', 'fits', 'Unworthy', 'more', 'declared', 'vast', 
'Swells', 'curb', 'wed', 'fiends', 'forced', 'sons', 'boards', 'strive', 'circle', 'Haste', 'praise', 'Bold', 'Though', 'watchings', 'wreak', 'commit', 'amidst', 'this', 'watched', 'they', 'good', \"Atrides'\", \"mother's\", 'untasted', 'advise', 'advice', 'Once', 'parts', 'woes', 'defence', 'proposals', 'languid', \"Ilion's\", 'fleeces', 'conquering', 'date', 'Idas', 'Ithacus', 'knee', 'guardian', 'presents', 'check', 'crowned—', 'wrest', \"tyrant's\", 'may', 'Dolopians', 'grizly', 'lifted', 'heart', 'exhaustless', 'the', 'flowery', 'prize', 'unseen', 'received', 'flows', 'former', \"Tydides'\", 'prerogative', 'matrons', 'swelled', 'bow', 'Himself', 'nymphs', 'furies', 'Half', 'porket', 'displayed', 'joy', 'cloudy', 'I', 'Greece', 'light', 'justly', \"Achilles'\", 'truth', 'molest', 'obey', \"Atreus'\", 'atone', 'remit', 'wisdom', 'ways', 'join', 'new', 'control', 'resign', 'cease', 'shameful', 'hostile', \"Thebes'[11]\", 'truly', '\"With', 'sail', 'Thrasymed', 'fell', 'crown', 'perils', 'With', 'From', \"harp's\", 'levelled', 'lust', 'becomes', 'supplies', 'refulgent', 'red', 'placed', 'art', 'braved', 'feels', 'undetermined', 'palaces', 'copious', 'Burns', 'See', 'crowned', 'time', 'oxen', 'yon', 'stream', 'dower', 'Wronged', 'extirpate', 'Nor', 'space', 'owed', 'Her', 'how', 'desired', 'declare', 'hold', 'divide', 'roar', 'wretch', 'offers', 'Thessalian', 'defies', 'unfold', 'dread', 'Ajax', 'Stern', '\"Princes', 'Along', 'fly', 'sable', 'thenceforth', 'impulse', 'sorrows', 'suppliant', 'hated', 'unfruitful', 'offend', 'ratify', 'like', 'conquest', 'himself', 'Passed', 'held', 'despised', 'Old', 'medium', 'sign', 'tent', 'withhold', 'compliant', 'tedious', 'adored', 'fill', 'prince', 'Jove', 'generous', \"Laertes'\", 'counsels', 'sadness', 'which', 'youth', 'friend', 'body', 'wrathful', '\"Say', 'No—let', 'recruit', 'Consume', 'Fears', 'ambassadors', 'Sat', 'number', 'shalt', 'Tydides', 'secret', 'boiling', 'dangers', 'scorns', \"Creon's\", 'stop', 'Were', 
'hearts', 'till', 'cheek', 'Who', 'basely', 'peace', 'Made', 'Cardamyé', 'consternation', 'merits', 'morn', 'sweet', 'next', 'Chrysothemis', 'omens', 'our', 'deaf', 'flies', 'three', 'ancient', 'O', 'Thither', 'done', 'bravely', 'Last', 'fires', 'ready', \"people's\", 'failed', '\"Hence', 'prudent', 'pastures', 'worst', 'scarce', 'or', 'Short', 'save', 'passion', 'Between', 'add', 'revenges', 'Ye', 'whole', 'here', 'redress', 'nights', 'Revere', 'mediate', 'raise', 'Kings', 'cause', 'towers', 'woe', 'unrelenting', 'arts', 'stubborn', 'gift', 'strife', 'direct', 'Sure', 'torrent', 'mound', '[10]', 'gates—', 'resounds', 'regard', '\"Patroclus', 'bloom', 'son', 'fate', 'Æacides', 'possessions', 'twelve', 'Sthenelus', 'retired', 'Amused', 'would', 'Achaian', 'Forbids', 'swift', 'aged', 'Strikes', 'Atrides', 'understood', 'Hector', 'their', 'Iälmen', 'heard', 'high', 'Lesbian', 'justified', 'bleed', 'approach', 'night', 'god', 'do', '[14]', 'find', 'labouring', 'hairs', 'walls', 'cares', 'pernicious', 'shun', 'arise', 'oppressed', 'bore', 'honours', 'almighty', 'weak', 'bring', 'rage', 'rosy', 'War', 'billows', '\"Then', 'slain', 'already', \"Apollo's\", 'wiser', 'urgent', 'reproach', 'tents', 'Rise', 'tried', 'for', 'Our', 'dawns', 'fruit', 'horror', 'Pelides', 'approved', 'chief', 'Twelve', 'disclosed', \"sage's\", 'commencing', 'grant', 'ray', 'instant', \"'Ye\", 'divinely', 'piercing', 'behold', 'Regard', 'inflamed', 'Greeks', 'navy', 'motion', 'No', 'libations', 'feast', 'tramples', 'safe', '[4]', 'can', 'buy', 'braves', 'soberly', 'gathered', 'leave', 'instructs', 'coals', 'trust', 'as', 'nuptials', 'is', 'away', 'thy', 'search', 'advance', 'enriched', 'sacked', 'every', 'globe', 'contain', 'demand', 'burns', 'child', 'appear', 'offered', 'ampler', 'armed', 'bid', 'Leaped', 'humbles', 'murderer', 'urged', 'wrought', 'princes', 'Swift', 'Since', 'around', 'vineyards', 'war', 'ourselves', 'Not', 'expire', \"'twas\", 'fleets', 'Ten', 'slights', 'prizes', 'silver', 
'fame', 'mind', 'surround', 'ground', 'consigned', 'us', 'injures', 'Sweeps', 'there', 'yet', 'open', 'Confiding', 'Pheræ', 'will', 'Alcyone', 'prevailed', 'acclaim', 'humanity', 'thence', \"e'en\", 'now', 'Orchomenian', 'offspring', 'Iphigenia', 'rite', 'glorious', 'honour', 'Such', 'souls', 'saving', 'The', 'heal', 'last', \"Helen's\", 'feet', 'seats', 'faith', \"Thracia's\", 'sprinkles', 'message', '[3]', 'spring', 'load', 'sense', 'whether', 'thundering', 'Greek', 'interest', 'Marpessa', 'offerings', 'aids', 'third', 'wholesome', 'Laodicé', 'royal', 'along', 'dames', 'dreadful', 'gales', 'right', 'stupid', 'solemn', 'Never', 'side', 'repine', 'warrior', 'stay', 'race', 'breast', 'charm', 'dealt', \"Œneus'\", 'resound', 'vanquished', 'graze', 'blessed', 'those', 'delegates', 'Seven', 'dusty', 'begun', 'pride', 'pains', 'wealth', 'rests', \"father's\", 'land', 'forgives', 'sway', 'white', 'rekindles', 'Great', 'faithful', 'spied', 'celestial', 'hand', 'sisters', '\"She', 'elders', 'in', 'glowing', 'favours', 'flaming', \"Menœtius'\", 'reared', 'Now', 'raging', 'goblets', 'soldiers', 'appeared', 'Is', 'afford', \"he'll\", 'spreads', 'win', 'mould', 'point', 'wondrous', 'retains', 'salt', 'reign', 'suggest', 'forests', 'low', 'bless', 'Placed', 'kingly', 'twenty', 'spirits', 'dejected', 'pays', 'and', 'dear', 'dead', 'drawn', 'bright', 'wife', 'slew', 'soft-trickling', 'she', 'disdain', 'seized', 'impetuous', 'them—to', 'Lost', 'suffered', \"o'erleaped\", 'more—three', 'repressed', 'whose', 'bears', 'props', 'Meleager', 'Next', 'calls', 'melt', 'Here', 'states', 'possessed', 'troubled', 'deceived', 'seek', 'through', 'repaid', 'vases', 'tributary', 'consult', 'answer', 'virtuous', 'hail', 'that', 'deprives', 'tremble', 'to-night', 'human', 'Heaven', '\"Cursed', 'listened', 'early', \"victors'\", 'ample', 'Curetian', 'hardships', 'mortals', 'harp', 'attention', 'he', 'unhappy', 'invades', 'ruler', 'designed', 'neglected', 'demands', 'hunger', 'There', 'others', 
'Constant', 'vase', 'nightly', 'immortal', 'knows', 'spares', 'addressing', 'what', 'lo', 'suits', 'tender', 'Egyptian', 'embraced', 'squadrons', 'polished', 'sings', \"'Tis\", 'warlike', 'front', 'whatever', 'are', 'lustful', 'so', 'portal', 'desire', 'Enopé', 'enjoys', 'ill-matched', 'name', 'Ægean', 'Lame', 'affection', 'cast', 'shall', 'spears', 'purpose', 'excel', 'decrees', 'Spread', 'such', 'laws', 'debates', 'Minerva', 'hoary', 'made', 'arose', ')', 'port', 'guides', 'waters', 'train', 'defends', 'woman', 'called', 'maintained', 'host', 'meditate', 'friends', 'works', '\"Fixed', \"Nestor's\", 'elect', 'length', 'fights', 'unpitying', 'blazing', 'enslaved', 'whitening', 'age', 'Neptune', 'pre-eminence', 'pray', 'thee', 'Your', 'mourning', 'quarrel', 'Be', 'declares', \"general's\", 'Descends', 'drives', 'same', 'tripods', '[2]', 'wronged', 'teach', 'shine', 'arrived', 'throne', 'Hodius', 'Iphis', 'guard', 'resolves', 'embers', 'sight', 'shakes', 'thyself', 'manly', \"life's\", 'beheld', \"Argos'\", 'goat', 'elected', 'earth', 'Age', 'had', 'valour', \"Pelides'\", 'dominion', 'Meanwhile', 'Assembled', 'council', 'godlike', 'foe', 'could', 'By', 'propitious', 'Eurybates', 'ease', \"woman's\", '\"Where', 'Safe', 'fight', 'vulgar', 'voice', 'fact', 'partial', 'unperceived', 'Calydon', 'victorious', 'lies', 'unskilled', 'designs', 'sent', 'joys', 'Disgraced', 'Amidst', 'aspires', 'beat', 'loud', \"country's\", '\"Monarch', 'Sycros', 'realms', 'lead', 'proposed', 'Diomedé', 'And', 'Above', 'charmed', 'purer', 'think', 'bent', 'Long', 'revenge', 'treaties', 'combat', 'slumbers', 'waves', 'youngest', 'fear', 'Despair', 'fatal', 'Thebæ', 'superior', 'fall', 'walked', 'fragrant', 'thine', 'rolling', 'well', 'said', 'conceived', 'grow', 'sceptres', 'from', '\"Ye', 'while', 'grateful', 'second', 'store', 'At', 'effect', 'than', 'cities', 'spoils', 'glittering', 'claim', 'soil', 'wall[5]', 'unequal', 'unfinished', 'sorrow', 'threatening', 'This', \"leader's\", 'rising', 
'speech', 'court', 'conquests', 'bread', 'fiercé', 'Lycomed', 'ship', 'sure', 'large', 'cold', 'Still', 'wan', 'oppose', 'took', 'equal', 'Can', 'shame', 'distressed', 'words', 'plain', 'unsullied', 'taught', 'First', 'assuage', 'Pallas', 'turrets', 'lodge', 'Pray', 'general', 'come', 'captives', 'command', 'laborious', 'dare', 'shed', 'Attend', 'Ætolian', 'wonders', \"mortal's\", 'warriors'}\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import re\n", + "\n", + "#opening and reading the file\n", + "with open(\"book_9.txt\", \"r\") as file:\n", + " text = file.read()\n", + "\n", + "#make a list of all the words including regex \n", + "tokens = re.findall(r\"[^\\s.,!?;:]+\", text)\n", + "\n", + "#FAILED ATTEMPT AT TRYING TO EXCLUDE PUNCTUATION AT THE END\n", + "#use list comprehension to create a new list and populate it with strip()\n", + "# tokens = [token.strip() for token in tokens]\n", + "print(f\"Amount of Tokens: {len(tokens)}\")\n", + "print(f\"Amount of Token Types: {len(set(tokens))}\")\n", + "print(f\"Types of Tokens: {set(tokens)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Question 1\n", + "#### Grappling with Regex\n", + "One of the first challenges was constructing the correct regular expression. I first tried [\w]+[\W], but the tokens still had punctuation and white space. So I tried \S+, which removed the white space but not the punctuation.\n", + "I then resorted to the strip function in a for loop through the tokens, but the tokens weren't changing. This is because I was not altering the actual string. As a result, I populated a new list while looping through the old one and simultaneously removing any punctuation. 
However, the punctuation was still there, so I changed the regex call to treat white space and punctuation as separators and build each token from the characters between them.\n", + "### Question 2\n", + "There are 6486 tokens and 2046 token types." ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" } }, "nbformat": 4,