diff --git a/A4/README-A4.md b/A4/README-A4.md new file mode 100644 index 0000000..b1d84b0 --- /dev/null +++ b/A4/README-A4.md @@ -0,0 +1,42 @@ +# A4 - Markov + +Generates random text based on training data. + + +## Usage + +### Synopsis + +```bash +juwei95-Markov.py [-h] [-o k] [-w] [-s seed] [filename] +``` + +### Positional arguments + +| Argument | Description | +| ---------- | ------------------------------------------------------------------------------------ | +| `filename` | The input file containing the training data; if omitted, input is read from stdin | + +### Options + +| Opt | Option | Description | +| --------- | ------------- | -------------------------------------------------------------------------------------------------------------------- | +| `-h` | `--help` | Show a help message and exit | +| `-o k` | `--order k` | Use Markov order k | +| `-w` | `--words` | Use word-based generation instead of character-based generation | +| `-s seed` | `--seed seed` | Initialize the random number generator with seed. Same input with the same seed will always produce the same output. | + + +### Example + +```bash +python3 juwei95-Markov.py erlkoenig.txt -o 5 +``` + + +## Details + +* The next output token is randomly selected from a list of non-unique possible continuations for the current context. The chance of a continuation being selected is implicitly determined by its number of occurrences in the training data after the given context. +* ⚠️ ***Warning***: There is no hard limit on the amount of text produced. Chances are that the generator will not terminate by itself. You may have to use `Ctrl+C` to kill it. +* Implemented in Python using only standard library modules. +* Tested using Python 3.12.12 on Ubuntu 24.04.3 LTS under WSL. 
"""Generate random text from training data with a Markov chain.

The training text is split into tokens (characters by default, words with
``-w``). Every run of ``order`` consecutive tokens is a context, and each
context maps to the list of tokens that followed it in the training data.
Text is generated by repeatedly picking a random continuation for the
current context until a context without any known continuation is reached.
"""

import argparse
import os
import random
import sys
from collections import deque


def parse_args() -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Returns:
        The parsed namespace with the attributes ``order`` (int), ``words``
        (bool), ``seed`` and ``filename`` (``None`` when reading stdin).

    Raises:
        SystemExit: With status -1 when the order is not positive. argparse
            itself also exits on malformed arguments.
    """
    parser = argparse.ArgumentParser(
        prog='juwei95-Markov.py',
        description='Generates random text based on training data',
    )
    parser.add_argument(
        '-o', '--order', action='store', metavar='k', default=1, type=int,
        help='use Markov order k',
    )
    parser.add_argument(
        '-w', '--words', action='store_true',
        help='use word-based generation instead of character-based generation',
    )
    # The seed is deliberately left untyped: a value given on the command
    # line stays a string (any text is a valid seed), the default is int 0.
    parser.add_argument(
        '-s', '--seed', action='store', metavar='seed', default=0,
        help='initialize the random number generator with seed',
    )
    parser.add_argument(
        'filename', action='store', nargs='?',
        help='input file name containing training text - if omitted, input is read from stdin',
    )
    args = parser.parse_args()
    if args.order <= 0:
        print(f"Invalid order of {args.order}, order must be > 0!", file=sys.stderr)
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. under ``python -S``).
        sys.exit(-1)
    return args


def _tokenize(lines, words: bool) -> list[str]:
    """Split an iterable of text lines into word or character tokens."""
    tokens: list[str] = []
    for line in lines:
        # str.split() without arguments never yields empty strings, so no
        # extra filtering is needed. Extending with a str adds its characters.
        tokens.extend(line.split() if words else line)
    return tokens


def parse_file(args: argparse.Namespace) -> list[str]:
    """Read the training data and return it as a list of tokens.

    Args:
        args: Namespace providing ``filename`` (falsy means stdin) and
            ``words`` (word tokens if true, single characters otherwise).

    Returns:
        All tokens of the input in their original order. In character mode
        line breaks are tokens too; in word mode whitespace is discarded.
    """
    if args.filename:
        # The context manager closes the file even if reading fails.
        with open(args.filename) as infile:
            return _tokenize(infile, args.words)
    # stdin is owned by the interpreter and is intentionally not closed.
    return _tokenize(sys.stdin, args.words)


def build_continuation_map(tokens: list[str], args: argparse.Namespace) -> dict[tuple[str, ...], list[str]]:
    """Map every context of ``args.order`` tokens to its observed successors.

    Args:
        tokens: The training tokens.
        args: Namespace providing the Markov ``order``.

    Returns:
        A plain dict from context tuple to the list of tokens that followed
        that context. Successors are kept once per occurrence (duplicates
        included), so a uniform pick from the list follows the frequencies
        seen in the training data.
    """
    order = args.order
    continuation_map: dict[tuple[str, ...], list[str]] = {}
    for pos in range(len(tokens) - order):
        key = tuple(tokens[pos:pos + order])
        continuation_map.setdefault(key, []).append(tokens[pos + order])
    return continuation_map


def print_output_token(token: str, args: argparse.Namespace):
    """Write one token to stdout, followed by a space in word mode."""
    print(token, end=" " if args.words else "")


def generate_text(continuation_map: dict[tuple[str, ...], list[str]], tokens: list[str], args: argparse.Namespace):
    """Yield generated tokens, starting with the first ``order`` input tokens.

    Args:
        continuation_map: Mapping as produced by ``build_continuation_map``.
        tokens: The training tokens; their first ``args.order`` entries seed
            the context and are yielded unchanged.
        args: Namespace providing ``order`` and ``seed``.

    Yields:
        One token at a time. The generator stops when the current context
        has no known continuation, which may never happen.
    """
    # A private generator keeps the output reproducible for a given seed
    # without reseeding the process-wide ``random`` module.
    rng = random.Random(args.seed)
    prefix = tokens[:args.order]
    yield from prefix
    # A bounded deque drops the oldest token automatically on append.
    context = deque(prefix, maxlen=args.order)
    while True:
        continuations = continuation_map.get(tuple(context))
        if not continuations:
            return
        next_token = rng.choice(continuations)
        context.append(next_token)
        yield next_token


def main():
    """Run the generator as a command-line program.

    Output stops quietly on Ctrl+C. When the reader of a pipe closes it
    early (e.g. ``| head``), the program exits with status 1 instead of
    printing a traceback.
    """
    args = parse_args()
    tokens = parse_file(args)
    continuation_map = build_continuation_map(tokens, args)
    try:
        try:
            for token in generate_text(continuation_map, tokens, args):
                print_output_token(token, args)
        except KeyboardInterrupt:
            # Expected way to stop a chain that never terminates by itself.
            pass
        print()
        # Flush here so that a closed pipe is reported inside this try block.
        sys.stdout.flush()
    except BrokenPipeError:
        # Redirect stdout to devnull so the flush performed at interpreter
        # shutdown does not raise a second BrokenPipeError.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)


if __name__ == "__main__":
    main()