Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions A4/README-A4.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# A4 - Markov

Generates random text based on training data.


## Usage

### Synopsis

```bash
juwei95-Markov.py [-h] [-o k] [-w] [-s seed] [filename]
```

### Positional arguments

| Argument | Description |
| ---------- | ------------------------------------------------------------------------------------ |
| `filename` | The input file containing the training data; if omitted, input is read from stdin    |

### Options

| Opt | Option | Description |
| --------- | ------------- | -------------------------------------------------------------------------------------------------------------------- |
| `-h` | `--help` | Show a help message and exit |
| `-o k` | `--order k` | Use Markov order k |
| `-w` | `--words` | Use word-based generation instead of character-based generation |
| `-s seed` | `--seed seed` | Initialize the random number generator with seed. Same input with the same seed will always produce the same output. |


### Example

```bash
python3 juwei95-Markov.py erlkoenig.txt -o 5
```


## Details

* The next output token is randomly selected from a list of non-unique possible continuations for the current context. The chance of a continuation being selected is implicitly determined by the number of times it occurs after the given context in the training data.
* ⚠️ ***Warning***: There is no hard limit on the amount of text produced. Chances are that the generator will not terminate by itself. You may have to use `ctrl + c` to kill it.
* Implemented in Python using only standard library modules.
* Tested using Python 3.12.12 on Ubuntu 24.04.3 LTS under WSL.
71 changes: 71 additions & 0 deletions A4/juwei95-Markov.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import argparse
import sys
import random


def parse_args() -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Returns:
        The parsed namespace with the attributes ``order`` (int, always
        greater than zero), ``words`` (bool), ``seed`` (``0`` by default,
        otherwise the raw command-line string) and ``filename`` (str, or
        ``None`` when no file was given).

    Raises:
        SystemExit: If argparse rejects the command line, or with status
            ``-1`` when the requested order is not positive.
    """
    parser = argparse.ArgumentParser(
        prog='juwei95-Markov.py',
        description='Generates random text based on training data',
    )
    parser.add_argument(
        '-o', '--order', action='store', metavar='k', default=1, type=int,
        help='use Markov order k',
    )
    parser.add_argument(
        '-w', '--words', action='store_true',
        help='use word-based generation instead of character-based generation',
    )
    # No type= here on purpose: random.seed() accepts both the int default
    # and the string that argparse delivers for a user-supplied seed.
    parser.add_argument(
        '-s', '--seed', action='store', metavar='seed', default=0,
        help='initialize the random number generator with seed',
    )
    parser.add_argument(
        'filename', action='store', nargs='?',
        help='input file name containing training text - if omitted, input is read from stdin',
    )
    args = parser.parse_args()
    if args.order <= 0:
        print(f"Invalid order of {args.order}, order must be > 0!", file=sys.stderr)
        # sys.exit() rather than the bare exit(): the latter is injected by
        # the site module and is unavailable under `python -S` or when frozen.
        sys.exit(-1)
    return args

def parse_file(args: argparse.Namespace) -> list[str]:
    """Read the training text and split it into tokens.

    Args:
        args: Parsed options. ``args.filename`` selects the input file (stdin
            is used when it is falsy) and ``args.words`` selects word tokens
            instead of single-character tokens.

    Returns:
        All tokens of the input in reading order. In character mode every
        character, including line breaks, is a token; in word mode the
        tokens are the whitespace-separated words.

    Raises:
        OSError: If the input file cannot be opened.
    """
    if args.filename:
        # The context manager closes the file even if reading fails midway.
        with open(args.filename) as infile:
            return _tokenize(infile, args.words)
    # stdin is owned by the interpreter, so it is deliberately left open.
    return _tokenize(sys.stdin, args.words)


def _tokenize(lines, words: bool) -> list[str]:
    """Collect the word or character tokens of every line in *lines*."""
    tokens: list[str] = []
    for line in lines:
        # str.split() without arguments never yields empty strings, so no
        # extra filtering is needed; extending with a str adds its characters.
        tokens.extend(line.split() if words else line)
    return tokens

def build_continuation_map(tokens: list[str], args: argparse.Namespace) -> dict[tuple[str, ...], list[str]]:
    """Map every context of ``args.order`` tokens to the tokens following it.

    Args:
        tokens: The training tokens in their original order.
        args: Parsed options; only ``args.order`` (the context length) is read.

    Returns:
        A dict whose keys are tuples of ``args.order`` consecutive tokens and
        whose values list every token seen directly after that context.
        Continuations are kept once per occurrence (duplicates included), so
        a uniform pick from the list is weighted by frequency. The dict is
        empty when there are not more than ``args.order`` tokens.
    """
    order = args.order
    continuation_map: dict[tuple[str, ...], list[str]] = {}
    for pos in range(len(tokens) - order):
        context = tuple(tokens[pos:pos + order])
        # setdefault creates the list on first sight of a context and
        # returns the existing one afterwards.
        continuation_map.setdefault(context, []).append(tokens[pos + order])
    return continuation_map

def print_output_token(token: str, args: argparse.Namespace):
    """Write one generated token to stdout without a trailing newline.

    In word mode (``args.words`` truthy) the token is followed by a single
    space; in character mode nothing is appended.
    """
    if args.words:
        terminator = " "
    else:
        terminator = ""
    print(token, end=terminator)

def generate_text(continuation_map: dict[tuple[str], list[str]], tokens: list[str], args: argparse.Namespace):
    """Lazily yield generated tokens, starting with the head of the training data.

    The global random generator is seeded with ``args.seed``. The first
    ``args.order`` training tokens are emitted unchanged and form the initial
    context. After that, each step picks one of the continuations recorded
    for the current context and slides the context forward by one token.
    Generation ends as soon as the current context has no entry in
    ``continuation_map``, which may never happen.
    """
    random.seed(args.seed)
    window = tokens[:args.order]
    yield from window
    while (key := tuple(window)) in continuation_map:
        candidates = continuation_map[key]
        # Uniform index into a list with repeats gives a frequency-weighted pick.
        chosen = candidates[random.randrange(len(candidates))]
        # Slide the context: drop the oldest token, add the newest.
        window.pop(0)
        window.append(chosen)
        yield chosen

def main():
    """Run the generator: parse options, read the input, print the text.

    Every generated token is printed as soon as it is produced. A keyboard
    interrupt during generation stops the output quietly, and a final
    newline is printed either way.
    """
    options = parse_args()
    training_tokens = parse_file(options)
    transitions = build_continuation_map(training_tokens, options)
    try:
        for piece in generate_text(transitions, training_tokens, options):
            print_output_token(piece, options)
    except KeyboardInterrupt:
        # Ctrl+C is treated as a normal way to stop generation, not an error.
        pass
    print()

# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()