diff --git a/.gitignore b/.gitignore index ae6b23f..9cb206c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ .env -training_data/ \ No newline at end of file +training_data/ +your_code_base.txt +your_code_base.pdf +myenv/ diff --git a/README.md b/README.md index 921d32b..74244e1 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,45 @@ # git2txt -Convert all files in git repository to .txt files. This is useful for training LLMs on your codebase. +Converts all the files of a git repository into .txt files. It also generates one combined .txt file and one .pdf file containing the whole codebase. This is useful for training LLMs on your codebase. ## How to Use 1. Create new .env file by copying example.env + ```shell cp example.env .env ``` + 2. Add necessary fields. The default fields are good to start with. + ```bash -GIT_PROJECT_DIRECTORY=/path/to/git/repo +GIT_PROJECT_DIRECTORY=/path/to/git/repo (ex. C:\Users\MyUserName\Codebases\GitHub\my-project-name) IGNORE_FILES=.env,package-lock.json IGNORE_DIRS=.git,.vscode,node_modules SAVE_DIRECTORY=training_data SKIP_EMPTY_FILES=true + +SOURCE_DIR=training_data +OUTPUT_FILE=your_code_base.txt +PDF_OUTPUT=your_code_base.pdf ``` + 3. Install dependencies. Using a virtual environment is recommended. + ```shell python -m pip install -r requirements.txt ``` -4. Run program + +4. In the `is_text_file` function in `main.py`, add the extension of every file type you want converted. Files with any other extension are skipped. + +5. Run program + ```shell python main.py ``` -5. You'll see your data files in the ```training_data/``` directory. This will be different if you changed the path via ```SAVE_DIRECTORY``` in ```.env``` file. +6. You'll see your data files in the `training_data/` directory. This will be different if you changed the path via `SAVE_DIRECTORY` in the `.env` file. ## Notes -- This program requires Python version 3.6 or later. It uses the f-string formatting technique introduced in Python 3.6. 
\ No newline at end of file + +- This program requires Python version 3.6 or later. It uses the f-string formatting technique introduced in Python 3.6. diff --git a/example.env b/example.env index d84b404..f8cf4e9 100644 --- a/example.env +++ b/example.env @@ -1,5 +1,9 @@ -GIT_PROJECT_DIRECTORY= +GIT_PROJECT_DIRECTORY=C:\Users\jimzord12\Codebases\GitHub\serve-tech IGNORE_FILES=.env,package-lock.json IGNORE_DIRS=.git,.vscode,node_modules SAVE_DIRECTORY=training_data -SKIP_EMPTY_FILES=true \ No newline at end of file +SKIP_EMPTY_FILES=true + +SOURCE_DIR=training_data +OUTPUT_FILE=your_code_base.txt +PDF_OUTPUT=your_code_base.pdf diff --git a/main.py b/main.py index 0d5c801..fadec3a 100644 --- a/main.py +++ b/main.py @@ -2,9 +2,53 @@ import os import hashlib import sys -load_env() +from reportlab.pdfgen import canvas +load_env(env_path=r'.\example.env') + +def is_text_file(file_path): + text_file_extensions = ['.txt', '.md', '.go', '.py', '.java', '.html', '.css', '.js', '.mod', '.sum'] # Add more as needed + return any(file_path.lower().endswith(ext) for ext in text_file_extensions) + +def combine_txt_files_and_create_pdf(source_directory, output_file, pdf_output, separator='**'): + separator_line = separator * 40 + '\n' + + # Initialize a list to store combined text + combined_text = [] + + with open(output_file, 'w', encoding='utf-8') as outfile: + for root, dirs, files in os.walk(source_directory): + for filename in files: + if filename.endswith('.txt'): + file_path = os.path.join(root, filename) + with open(file_path, 'r', encoding='utf-8') as infile: + content = infile.read() + combined_text.append(separator_line) + combined_text.append(f"{filename.center(len(separator_line))}\n") + combined_text.append(separator_line) + combined_text.append(content + '\n') + combined_text.append(separator_line) + + # Write to the TXT file + outfile.writelines([separator_line, f"{filename.center(len(separator_line))}\n", separator_line, content + '\n', separator_line]) + + # 
Write to the PDF file + c = canvas.Canvas(pdf_output) + text = c.beginText(40, 800) # Starting position + for line in combined_text: + # Split the combined text into lines + for subline in line.split('\n'): + text.textLine(subline.strip()) + if text.getY() < 40: # Move to a new page if there's no space + c.drawText(text) + c.showPage() + text = c.beginText(40, 800) + c.drawText(text) + c.save() + + print(f'All text files have been combined into {output_file} and {pdf_output}') + def ignore_dir(file_path: str) -> bool: for _dir in IGNORE_DIRS: if _dir in file_path: @@ -25,9 +69,9 @@ def get_file_path() -> None: def write_txt(txt_data: str, file_name: str, md5_hash: str) -> None: full_path = os.path.join(save_directory, file_name + f'_{md5_hash}.txt') - with open(full_path, mode='w') as data: + with open(full_path, mode='w', encoding='utf-8') as data: data.write(txt_data) - print(f'TXT written to: {full_path}') + print(f'TXT written to: {full_path}\n') def main() -> None: @@ -42,9 +86,15 @@ def main() -> None: print('Creating TXT...') for index, file in enumerate(FILES): print(f'File #{index+1}: {file}') - # If line is empty, skip it + + #if file is not a text file, skip it + if not is_text_file(file): + print(f'Skipping: [{os.path.basename(file)}] a (probably) non-text file.\n') + continue + + # If file is empty, skip it if os.environ.get('SKIP_EMPTY_FILES').upper() == 'TRUE' and os.path.getsize(file) == 0: - print('FILE IS EMPTY. SKIPPING.') + print('FILE IS EMPTY. 
SKIPPING.\n') continue with open(file, mode='r', encoding='utf-8') as git_file: md5_hash = hashlib.md5(git_file.read().encode('utf-8')).hexdigest() @@ -68,3 +118,9 @@ def main() -> None: os.makedirs(save_directory, exist_ok=True) main() print(f'Training data can be found in {save_directory}/ directory.') + + # My Code + source_dir = os.environ.get('SOURCE_DIR') # Change this to your source directory + output_file = os.environ.get('OUTPUT_FILE') # The final combined text file + pdf_output = os.environ.get('PDF_OUTPUT') # The final PDF file + combine_txt_files_and_create_pdf(source_dir, output_file, pdf_output) diff --git a/requirements.txt b/requirements.txt index 812362a..d648817 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,6 @@ -pydotenvs==0.2.0 \ No newline at end of file +chardet==5.2.0 +click==8.1.7 +colorama==0.4.6 +pillow==10.3.0 +pydotenvs==0.2.0 +reportlab==4.1.0