-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy path: run_linux.sh
More file actions
executable file
·97 lines (86 loc) · 3.6 KB
/
run_linux.sh
File metadata and controls
executable file
·97 lines (86 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/bin/bash
# Offline Wikipedia Text API — Linux setup-and-run script.
# Downloads the Wikipedia datasets (if missing), prepares a Python virtual
# environment, installs requirements, and starts the API server.
#
# Stop on any error (-e), inherit the ERR trap in functions and subshells (-E),
# and make a pipeline fail if any stage fails (pipefail) for easier debugging.
set -eEo pipefail
trap 'echo >&2 "Error - exited with status $? at line $LINENO"' ERR

# Step 0: Parse any arguments we care about
# Base directory holding both dataset folders; overridable via -d/--database_dir.
DATABASE_DIR="."
WIKI_DATA_SET_DIR="$DATABASE_DIR/wiki-dataset"
TXTAI_WIKIPEDIA_DIR="$DATABASE_DIR/txtai-wikipedia"
# Unrecognized CLI arguments are collected here and forwarded to start_api.py.
OTHER_ARGS=()
# Print usage information for this script to stdout.
help() {
  cat <<EOF
usage: $0 [-h] [-d DATABASE_DIR]

Offline Wikipedia Text API

options:
-h, --help show this help message and exit
-d DATABASE_DIR, --database_dir DATABASE_DIR
 Base directory containing the wiki-dataset and txtai-wikipedia
 folders.
EOF
}
# Consume CLI arguments. Recognized flags configure the database location;
# anything else is forwarded verbatim to start_api.py at the end of the script.
while [[ $# -gt 0 ]]; do
  case $1 in
    --database_dir|-d)
      # Fail fast with a clear message when the flag is missing its value;
      # otherwise "shift 2" below would abort the script (via set -e) with a
      # cryptic "shift count out of range"-style error.
      if [[ $# -lt 2 ]]; then
        echo >&2 "Error: $1 requires a directory argument"
        exit 1
      fi
      DATABASE_DIR="$2"
      # Derived dataset paths must track the new base directory.
      WIKI_DATA_SET_DIR="$DATABASE_DIR/wiki-dataset"
      TXTAI_WIKIPEDIA_DIR="$DATABASE_DIR/txtai-wikipedia"
      shift 2
      ;;
    --help|-h)
      help
      exit 0
      ;;
    *)
      # For any unrecognized args, store them to pass through
      OTHER_ARGS+=("$1")
      shift
      ;;
  esac
done
# Step A: Create and activate a Python virtual environment
echo "Creating virtual environment"
# Prefer python3: many Linux distributions ship no bare "python" binary, which
# made the original "python -m venv" fail with "command not found".
PYTHON_BIN="$(command -v python3 || command -v python)" \
  || { echo >&2 "Error: no python3/python interpreter found on PATH"; exit 1; }
if [ ! -d "venv" ]; then
  "$PYTHON_BIN" -m venv venv
else
  echo "Existing venv detected. Activating."
fi
echo "Activating virtual environment"
source venv/bin/activate
# Step B: Install requirements from requirements.txt
printf '%s\n' '---------------------------------------------------------------'
printf '%s\n' 'Installing python requirements from requirements.txt'
pip install --upgrade pip
pip install -r requirements.txt
# Step C: Clone the git repository for full wiki articles into a directory called "wiki-dataset"
echo ---------------------------------------------------------------
echo "Downloading Wikipedia dataset. As of 2025-07-13, this is about 46GB"
if [ ! -d "$WIKI_DATA_SET_DIR" ]; then
  # Clone metadata only (skip smudge); the large LFS payload is pulled below.
  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/NeuML/wikipedia-20250620 "$WIKI_DATA_SET_DIR"
  echo "Pulling LFS files for wiki dataset (this may take a while)..."
else
  echo "Existing wiki-dataset directory detected."
  echo "Checking for LFS files in wiki dataset..."
fi
# Run the pull in a subshell so the working directory is restored even when
# "git lfs pull" fails. The previous `cd dir && git lfs pull && cd -` skipped
# the `cd -` on failure, leaving the script stranded inside the dataset
# directory and breaking every later relative path; `cd -` also printed noise.
( cd "$WIKI_DATA_SET_DIR" && git lfs pull ) || echo "LFS pull failed for wiki dataset"
# Step D: Clone the git repository for txtai wiki summaries into a directory called txtai-wikipedia
echo ---------------------------------------------------------------
echo "Downloading txtai-wikipedia dataset. As of 2025-07-13, this is about 15GB."
if [ ! -d "$TXTAI_WIKIPEDIA_DIR" ]; then
  # Clone metadata only (skip smudge); the large LFS payload is pulled below.
  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/NeuML/txtai-wikipedia "$TXTAI_WIKIPEDIA_DIR"
  echo "Pulling LFS files for txtai dataset (this may take a while)..."
else
  echo "Existing txtai-wikipedia directory detected."
  echo "Checking for LFS files in txtai dataset..."
fi
# Subshell keeps the working directory intact even if the pull fails; the
# previous `cd dir && git lfs pull && cd -` skipped `cd -` on failure and left
# the script running from inside the dataset directory.
( cd "$TXTAI_WIKIPEDIA_DIR" && git lfs pull ) || echo "LFS pull failed for txtai dataset"
# Finally: Start the API
separator='---------------------------------------------------------------'
echo "$separator"
echo "Starting API. If this is the first run, setup may take 10-15 minutes depending on your machine."
echo "Setup time is due to indexing wikipedia article titles into a json file for API speed."
echo "$separator"
echo "API Starting..."
# Forward the chosen database dir plus any unrecognized CLI args collected above.
python start_api.py --database_dir "$DATABASE_DIR" "${OTHER_ARGS[@]}"