From 992bb46784bc86dd7e264ed87e7753f8b7fd1bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julianne=20Wei=C3=9F?= Date: Fri, 1 May 2026 15:06:03 +0200 Subject: [PATCH 1/3] Add A4 solution --- A4/juwei95-Markov.py | 67 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 A4/juwei95-Markov.py diff --git a/A4/juwei95-Markov.py b/A4/juwei95-Markov.py new file mode 100644 index 0000000..d61eec5 --- /dev/null +++ b/A4/juwei95-Markov.py @@ -0,0 +1,67 @@ +import argparse +import sys +import random + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(prog='juwei95-Markov.py', description='Generates random text based on training data') + parser.add_argument('-o', '--order', action='store', metavar='k', default=1, type=int, help='use Markov order k') + parser.add_argument('-w', '--words', action='store_true', help='use word-based generation instead of character-based generation') + parser.add_argument('-s', '--seed', action='store', metavar='seed', default=0, help='initialize the random number generator with seed') + parser.add_argument('filename', action='store', nargs='?', help='input file name containing training text - if omitted, input is read from stdin') + args = parser.parse_args() + if args.order <= 0: + print(f"Invalid order of {args.order}, order must be > 0!", file=sys.stderr) + exit(-1) + return args + +def parse_file(args: argparse.Namespace): + if args.filename: + infile = open(args.filename) + else: + infile = sys.stdin + tokens = [] + for line in infile: + if args.words: + tokens.extend(filter(lambda word: word != "", line.split())) + else: + tokens.extend(list(line)) + if args.filename: + infile.close() + return tokens + +def build_continuation_map(tokens: list[str], args: argparse.Namespace) -> dict[tuple[str], list[str]]: + continuation_map: dict[tuple[str], list[str]] = {} + for pos in range(len(tokens) - args.order): + key = tuple(tokens[pos:pos + args.order]) + value = tokens[pos + 
args.order] + if key in continuation_map: + continuation_map[key].append(value) + else: + continuation_map[key] = [value] + return continuation_map + +def print_output_token(token: str, args: argparse.Namespace): + print(token, end=" " if args.words else "") + +def generate_text(continuation_map: dict[tuple[str], list[str]], tokens: list[str], args: argparse.Namespace) -> str: + random.seed(args.seed) + context = tokens[:args.order] + for token in tokens[:args.order]: + print_output_token(token, args) + while tuple(context) in continuation_map: + continuations = continuation_map[tuple(context)] + next_token = continuations[random.randrange(len(continuations))] + context.pop(0) + context.append(next_token) + print_output_token(next_token, args) + print() + +def main(): + args = parse_args() + tokens = parse_file(args) + continuation_map = build_continuation_map(tokens, args) + generate_text(continuation_map, tokens, args) + +if __name__ == "__main__": + main() From de567ab7edb9f725fe636708b905c4935a79d45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julianne=20Wei=C3=9F?= Date: Fri, 1 May 2026 16:08:53 +0200 Subject: [PATCH 2/3] Refactor to generator pattern --- A4/juwei95-Markov.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/A4/juwei95-Markov.py b/A4/juwei95-Markov.py index d61eec5..8de7a60 100644 --- a/A4/juwei95-Markov.py +++ b/A4/juwei95-Markov.py @@ -44,24 +44,28 @@ def build_continuation_map(tokens: list[str], args: argparse.Namespace) -> dict[ def print_output_token(token: str, args: argparse.Namespace): print(token, end=" " if args.words else "") -def generate_text(continuation_map: dict[tuple[str], list[str]], tokens: list[str], args: argparse.Namespace) -> str: +def generate_text(continuation_map: dict[tuple[str], list[str]], tokens: list[str], args: argparse.Namespace): random.seed(args.seed) context = tokens[:args.order] for token in tokens[:args.order]: - print_output_token(token, args) + yield token while tuple(context) 
in continuation_map: continuations = continuation_map[tuple(context)] next_token = continuations[random.randrange(len(continuations))] context.pop(0) context.append(next_token) - print_output_token(next_token, args) - print() + yield next_token def main(): args = parse_args() tokens = parse_file(args) continuation_map = build_continuation_map(tokens, args) - generate_text(continuation_map, tokens, args) + try: + for token in generate_text(continuation_map, tokens, args): + print_output_token(token, args) + except KeyboardInterrupt: + pass + print() if __name__ == "__main__": main() From f9ab4ff91fe03dc9465cd41732a27dddfdf4e191 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julianne=20Wei=C3=9F?= Date: Fri, 1 May 2026 16:09:43 +0200 Subject: [PATCH 3/3] Add README --- A4/README-A4.md | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 A4/README-A4.md diff --git a/A4/README-A4.md b/A4/README-A4.md new file mode 100644 index 0000000..b1d84b0 --- /dev/null +++ b/A4/README-A4.md @@ -0,0 +1,42 @@ +# A4 - Markov + +Generates random text based on training data. + + +## Usage + +### Synopsis + +```bash +juwei95-Markov.py [-h] [-o k] [-w] [-s seed] [filename] +``` + +### Positional arguments + +| Argument | Description | +| ---------- | ------------------------------------------------------------------------------------ | +| `filename` | The input file containing the training data, if omitted the input is read from stdin | + +### Options + +| Opt | Option | Description | +| --------- | ------------- | -------------------------------------------------------------------------------------------------------------------- | +| `-h` | `--help` | Show a help message and exit | +| `-o k` | `--order k` | Use Markov order k | +| `-w` | `--words` | Use word-based generation instead of character-based generation | +| `-s seed` | `--seed seed` | Initialize the random number generator with seed. 
Same input with the same seed will always produce the same output. | + + +### Example + +```bash +python3 juwei95-Markov.py erlkoenig.txt -o 5 +``` + + +## Details + +* The next output token is randomly selected from a list of non-unique possible continuations for the current context. The chance of a continuation being selected is implicitly determined by how often it occurs after the given context in the training data. +* ⚠️ ***Warning***: There is no hard limit on the amount of text produced. Chances are that the generator will not terminate by itself. You may have to use `ctrl + c` to kill it. +* Implemented in Python using only standard library modules. +* Tested using Python 3.12.12 on Ubuntu 24.04.3 LTS under WSL.