Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
name: CI

# Run on pushes to main and on every pull request; changes to Markdown files
# alone do not trigger the workflow.
on:
  push:
    branches: [ main ]
    paths-ignore:
      - '**/*.md'
  pull_request:
    paths-ignore:
      - '**/*.md'

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      # Sets the `run` output to 'false' when no Python file changed or when the
      # Python changes consist solely of comment lines, so the install, lint and
      # test steps below can be skipped.
      - name: Detect comment-only changes
        id: changes
        env:
          # Pass event data through the environment instead of interpolating it
          # into the script, so a crafted branch name cannot inject shell code.
          BASE_REF: ${{ github.base_ref }}
          BEFORE_SHA: ${{ github.event.before }}
        run: |
          BASE=""
          if [ -n "$BASE_REF" ]; then
            # pull_request: compare against the tip of the target branch.
            git fetch --depth=2 origin "$BASE_REF"
            BASE="origin/$BASE_REF"
          elif [ -n "$BEFORE_SHA" ] && git fetch --depth=1 origin "$BEFORE_SHA"; then
            # push: github.base_ref is empty, so compare against the previous tip.
            BASE="$BEFORE_SHA"
          fi

          if [ -z "$BASE" ]; then
            # No usable base (e.g. the first push of a branch): run everything.
            echo "run=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          CHANGED=$(git diff --name-only "$BASE" HEAD -- '*.py')
          if [ -z "$CHANGED" ]; then
            echo "run=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Keep added and removed lines, then drop the +++/--- file headers and
          # pure comment lines. Removed lines count too: deleting code can break tests.
          # `|| true`: grep exits 1 when nothing is left, which would otherwise
          # abort the step under `bash -e` instead of reporting run=false.
          DIFF=$(git diff "$BASE" HEAD -- '*.py' \
            | grep -E '^[+-]' \
            | grep -Ev '^(\+\+\+|---) ' \
            | grep -Ev '^[+-][[:space:]]*#' || true)
          if [ -z "$DIFF" ]; then
            echo "run=false" >> "$GITHUB_OUTPUT"
          else
            echo "run=true" >> "$GITHUB_OUTPUT"
          fi
      - name: Install dependencies
        if: steps.changes.outputs.run == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e .
          # pytest is installed explicitly so the test step does not depend on
          # it being pulled in by the package's own requirements.
          pip install flake8 pytest
      - name: Run flake8
        if: steps.changes.outputs.run == 'true'
        run: flake8 .
      - name: Run pytest
        if: steps.changes.outputs.run == 'true'
        run: pytest
50 changes: 32 additions & 18 deletions PyTokenCounter/encoding_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,26 @@ def __init__(
errorText.append("Detected encoding: ", style="green")
errorText.append(f"{encoding}", style="bold")
errorText.append("\n")
errorText.append("File path: ", style="green")
errorText.append(f"{filePath}", style="bold blue")
# Intentionally do not include file path inside the panel to avoid line wrapping issues

panel = Panel(
errorText, title="Encoding Error", title_align="left", border_style="red"
)

console = Console(width=80, color_system="truecolor", record=True)
console = Console(color_system="truecolor", record=True)

with console.capture() as capture:

console.print("") # Add a new line before the panel
console.print(panel)
console.print("")

# Print the file path outside the panel and prevent wrapping so it remains clickable
pathText = Text()
pathText.append("File path: ", style="green")
pathText.append(f"{filePath}", style="bold blue")
pathText.no_wrap = True
console.print(pathText)
captured = capture.get()

# Store the formatted panel; pass a plain message to the base Exception
Expand Down Expand Up @@ -105,24 +112,31 @@ def ReadTextFile(filePath: Path | str) -> str:

return ""

with file.open("rb") as binaryFile:
rawBytes = file.read_bytes()
detection = chardet.detect(rawBytes)
detectedEncoding = detection.get("encoding")
confidence = detection.get("confidence", 0)

detection = chardet.detect(binaryFile.read())
encoding = detection["encoding"]
encodingsToTry: list[str] = []
if detectedEncoding:
encodingsToTry.append(detectedEncoding)

if encoding:

actualEncoding = encoding
encoding = "utf-8"
if confidence < 0.8:
for fallback in ["windows-1252", "utf-8", "latin-1"]:
if fallback not in encodingsToTry:
encodingsToTry.append(fallback)

for enc in encodingsToTry:
try:

return file.read_text(encoding=encoding)

text = rawBytes.decode(enc)
if enc != "utf-8":
text = text.encode("utf-8").decode("utf-8")
return text
except UnicodeDecodeError:
continue

raise UnsupportedEncodingError(encoding=actualEncoding, filePath=filePath)

else:

raise UnsupportedEncodingError(encoding=encoding, filePath=filePath)
raise UnsupportedEncodingError(
encoding=detectedEncoding,
filePath=filePath,
message=f"Failed to decode using encodings: {', '.join(encodingsToTry)}",
)
101 changes: 80 additions & 21 deletions PyTokenCounter/file_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@
from .progress import _InitializeTask, _UpdateTask, _tasks
from .core import BINARY_EXTENSIONS, TokenizeStr

def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int:
def _CountDirFiles(
dirPath: Path,
recursive: bool = True,
*,
includeHidden: bool = False,
excludeBinary: bool = True,
) -> int:
"""
Count the number of files in a directory.

Expand All @@ -20,6 +26,10 @@ def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int:
The path to the directory in which to count files.
recursive : bool, optional
Whether to count files in subdirectories recursively (default is True).
includeHidden : bool, optional
Whether to include hidden files and directories (default is False).
excludeBinary : bool, optional
Whether to exclude binary files based on extension (default is True).

Returns
-------
Expand All @@ -42,17 +52,33 @@ def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int:

for entry in dirPath.iterdir():

if entry.is_dir():
# Skip hidden files and directories entirely if not including hidden
if not includeHidden and entry.name.startswith("."):
continue

numFiles += _CountDirFiles(entry, recursive=recursive)
if entry.is_dir():
# Recurse into subdirectories (respect hidden handling)
numFiles += _CountDirFiles(
entry,
recursive=recursive,
includeHidden=includeHidden,
excludeBinary=excludeBinary,
)

else:

# Optionally skip binary files
if excludeBinary and entry.suffix.lower() in BINARY_EXTENSIONS:
continue
numFiles += 1

else:

numFiles = sum(1 for entry in dirPath.iterdir() if entry.is_file())
for entry in dirPath.iterdir():
if entry.is_file():
if not includeHidden and entry.name.startswith("."):
continue
if excludeBinary and entry.suffix.lower() in BINARY_EXTENSIONS:
continue
numFiles += 1

return numFiles

Expand Down Expand Up @@ -519,7 +545,12 @@ def TokenizeDir(

raise ValueError(f'Given directory path "{dirPath}" is not a directory.')

numFiles = _CountDirFiles(dirPath=dirPath, recursive=recursive)
numFiles = _CountDirFiles(
dirPath=dirPath,
recursive=recursive,
includeHidden=includeHidden,
excludeBinary=excludeBinary,
)

if not quiet:

Expand Down Expand Up @@ -582,7 +613,9 @@ def TokenizeDir(
mapTokens=mapTokens,
)

except UnicodeDecodeError:
except UnicodeDecodeError as e:

encoding = e.encoding or "unknown"

if excludeBinary:

Expand All @@ -591,15 +624,19 @@ def TokenizeDir(
_UpdateTask(
taskName=taskName,
advance=1,
description=f"Skipping binary file {entry.relative_to(dirPath)}",
description=(
f"Skipping binary file {entry.relative_to(dirPath)} (encoding: {encoding})"
),
quiet=quiet,
)

continue

else:

raise
raise UnsupportedEncodingError(
encoding=encoding, filePath=entry
) from e

if mapTokens:

Expand Down Expand Up @@ -742,7 +779,12 @@ def GetNumTokenDir(

raise ValueError(f'Given path "{dirPath}" is not a directory.')

numFiles = _CountDirFiles(dirPath=dirPath, recursive=recursive)
numFiles = _CountDirFiles(
dirPath=dirPath,
recursive=recursive,
includeHidden=includeHidden,
excludeBinary=excludeBinary,
)

if not quiet:

Expand Down Expand Up @@ -811,7 +853,9 @@ def GetNumTokenDir(
mapTokens=False,
)

except UnicodeDecodeError:
except UnicodeDecodeError as e:

encoding = e.encoding or "unknown"

if excludeBinary:

Expand All @@ -820,15 +864,19 @@ def GetNumTokenDir(
_UpdateTask(
taskName=taskName,
advance=1,
description=f"Skipping binary file {entry.relative_to(dirPath)}",
description=(
f"Skipping binary file {entry.relative_to(dirPath)} (encoding: {encoding})"
),
quiet=quiet,
)

continue

else:

raise
raise UnsupportedEncodingError(
encoding=encoding, filePath=entry
) from e

if mapTokens:

Expand Down Expand Up @@ -1051,7 +1099,9 @@ def TokenizeFiles(
mapTokens=mapTokens,
)

except UnicodeDecodeError:
except UnicodeDecodeError as e:

encoding = e.encoding or "unknown"

if excludeBinary:

Expand All @@ -1060,15 +1110,19 @@ def TokenizeFiles(
_UpdateTask(
taskName="Tokenizing File/Directory List",
advance=1,
description=f"Skipping binary file {entry.name}",
description=(
f"Skipping binary file {entry.name} (encoding: {encoding})"
),
quiet=quiet,
)

continue

else:

raise
raise UnsupportedEncodingError(
encoding=encoding, filePath=entry
) from e

if mapTokens:

Expand Down Expand Up @@ -1329,7 +1383,9 @@ def GetNumTokenFiles(
mapTokens=False,
)

except UnicodeDecodeError:
except UnicodeDecodeError as e:

encoding = e.encoding or "unknown"

if excludeBinary:

Expand All @@ -1338,15 +1394,19 @@ def GetNumTokenFiles(
_UpdateTask(
taskName="Counting Tokens in File/Directory List",
advance=1,
description=f"Skipping binary file {entry.name}",
description=(
f"Skipping binary file {entry.name} (encoding: {encoding})"
),
quiet=quiet,
)

continue

else:

raise
raise UnsupportedEncodingError(
encoding=encoding, filePath=entry
) from e

if mapTokens:

Expand Down Expand Up @@ -1454,4 +1514,3 @@ def GetNumTokenFiles(
raise RuntimeError(
f'Unexpected error. Given inputPath "{inputPath}" is neither a file, a directory, nor a list.'
)

10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,16 @@ Along with ignoring the extensions in the exclude list to quickly bypass known f

---

## Running Tests

After installing the package's dependencies, run the test suite with `pytest`:

```bash
pytest
```

The tests rely on the sample data located in the `Tests/` directory.

## Maintainers

- [Kaden Gruizenga](https://github.com/kgruiz)
Expand Down
1 change: 1 addition & 0 deletions Tests/Input/TestFile1252.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Café résumé naïve fiancé
Loading