Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8', '3.11']
python-version: ['3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v2
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
*.c

# C extensions
*.so
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ lint:
poetry run isort tests csv_bleach
poetry run black tests csv_bleach
poetry run flake8 tests csv_bleach
poetry run mypy tests csv_bleach
poetry run mypy . --ignore-missing-imports

test:
poetry run pytest tests --cov=csv_bleach --cov-report term-missing --cov-fail-under 100
53 changes: 53 additions & 0 deletions build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""https://github.com/mdgoldberg/poetry-cython-example

needs python3-dev
"""

import os
import shutil
from distutils.command.build_ext import build_ext
from distutils.core import Distribution, Extension

from Cython.Build import cythonize

compile_args = ["-march=native", "-O3", "-msse", "-msse2", "-mfma", "-mfpmath=sse"]
link_args: list = []
include_dirs: list = []
libraries = ["m"]


def build():
    """Cythonize csv_bleach/json_encode.pyx and copy the built extension
    back into the source tree (run via poetry's build.py hook)."""
    ext = Extension(
        "*",
        ["csv_bleach/json_encode.pyx"],
        extra_compile_args=compile_args,
        extra_link_args=link_args,
        include_dirs=include_dirs,
        libraries=libraries,
    )
    compiled = cythonize(
        [ext],
        include_path=include_dirs,
        compiler_directives={"binding": True, "language_level": 3},
    )

    dist = Distribution({"name": "extended", "ext_modules": compiled})
    dist.package_dir = "extended"

    builder = build_ext(dist)
    builder.ensure_finalized()
    builder.run()

    # Mirror each built artifact from the build dir into the project tree,
    # then add execute permission wherever read permission already exists.
    for built in builder.get_outputs():
        target = os.path.relpath(built, builder.build_lib)
        shutil.copyfile(built, target)
        perms = os.stat(target).st_mode
        os.chmod(target, perms | ((perms & 0o444) >> 2))


if __name__ == "__main__":
    build()
Empty file removed csv_bleach/__init__.py
Empty file.
24 changes: 17 additions & 7 deletions csv_bleach/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
from typing import Optional

import click
from charset_normalizer import from_bytes

from csv_bleach.detect_row_count import detect_row_count
from csv_bleach.type_casting import infer_types
from csv_bleach.detect_delimiter import infer_delimiter
from csv_bleach.json_encode import parse_line

logging.basicConfig(level=logging.INFO)

__all__ = ["cli"]


@click.command()
@click.argument("file", type=click.Path(exists=True))
Expand All @@ -23,13 +26,20 @@ def cli(file: str, output: Optional[str]):
output = f"{filepath}.scsv"

with open(file, "rb") as input_file:
row_count = detect_row_count(input_file)
row_count = sum(1 for _ in iter(input_file))

with open(file, "rb") as input_file:
type_caster = infer_types(input_file)

with open(file, "rb") as input_file, open(output, "w") as output_file:
type_caster.process_file(input_file, output_file, row_count)
delimiter, column_count = infer_delimiter(input_file)

with open(file, "rb") as input_file, open(output, "wb") as output_file:
with click.progressbar(input_file, length=row_count) as rows:
for i, row in enumerate(rows):
try:
utf8_str = row.decode()
except UnicodeError:
utf8_str = str(from_bytes(row).best())
json_str = parse_line(utf8_str, delimiter, column_count)
output_file.write(json_str + "\n")


if __name__ == "__main__":
Expand Down
71 changes: 38 additions & 33 deletions csv_bleach/detect_delimiter.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
from __future__ import annotations

import collections
import logging
from typing import Iterator
from typing import BinaryIO, Iterator

from charset_normalizer import from_bytes

__all__ = ["infer_delimiter"]

QUOTE = 34
NEW_LINE = 10
CARRIAGE_RETURN = 13


class DelimiterDetector:
def __init__(self, delimiter_count: dict[str, int]):
self.delimiter_count = delimiter_count

@classmethod
def parse_row(cls, txt: str) -> DelimiterDetector:
def parse_row(cls, byte_txt: bytes) -> DelimiterDetector:
txt = str(from_bytes(byte_txt).best())
escaped = False
chars = []
prev = None
Expand All @@ -39,34 +44,34 @@ def parse_row(cls, txt: str) -> DelimiterDetector:
def __eq__(self, other):
return self.delimiter_count == other.delimiter_count

@classmethod
def combine(cls, rows: Iterator[DelimiterDetector]) -> DelimiterDetector:
def _combine(
left: DelimiterDetector, right: DelimiterDetector
) -> DelimiterDetector:
intersection = {
key: value
for key, value in left.delimiter_count.items()
if key in right.delimiter_count
and left.delimiter_count[key] == right.delimiter_count[key]
}
return cls(intersection)

def log(row_number):
k, *_ = list(current.delimiter_count.keys())
logging.info(
f"`{k}` has been identified as the delimiter after {row_number+1} rows"
)

current = next(rows)
for i, row in enumerate(rows):
current = _combine(current, row)
if len(current.delimiter_count) == 1:
log(i)

def combine(rows: Iterator[DelimiterDetector]) -> DelimiterDetector:
current = next(rows)
for row in rows:
intersection = {
key: value
for key, value in current.delimiter_count.items()
if key in row.delimiter_count
and current.delimiter_count[key] == row.delimiter_count[key]
}
current = DelimiterDetector(intersection)

if len(current.delimiter_count) == 1:
return current
if len(current.delimiter_count) == 2:
if " " in current.delimiter_count:
current.delimiter_count.pop(" ")
return current
if len(current.delimiter_count) == 2:
if " " in current.delimiter_count:
current.delimiter_count.pop(" ")
log(i)
return current
raise ValueError("no delimiter detected in file")
raise ValueError("no delimiter detected in file")


def infer_delimiter(rows: BinaryIO) -> tuple[str, int]:
    """Infer how a csv is delimited.

    Returns the delimiter character and the number of fields per row.
    """
    detector = combine(DelimiterDetector.parse_row(row) for row in rows)
    counts = detector.delimiter_count
    # combine() guarantees exactly one surviving candidate on success.
    assert len(counts) == 1, counts
    delimiter, occurrences_per_row = next(iter(counts.items()))
    # N occurrences of the delimiter per row means N + 1 fields.
    return delimiter, occurrences_per_row + 1
5 changes: 0 additions & 5 deletions csv_bleach/detect_row_count.py

This file was deleted.

82 changes: 82 additions & 0 deletions csv_bleach/infer_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import json
from collections import defaultdict
from typing import Type, Optional


def get_bound(value):
    """Return the magnitude used for min/max tracking of a cell value:
    0 for None and booleans, the length for strings, the value itself
    for numbers."""
    if value is None or isinstance(value, bool):
        return 0
    if isinstance(value, str):
        return len(value)
    return value


class Schema:
    """Accumulates a JSON-Schema-style summary for one CSV column:
    the set of python types seen, per-type min/max bounds, and a
    frequency count of distinct values."""

    def __init__(self):
        self.type = set()  # python types observed in this column
        self.min = {}  # per-type lower bound (length for str, value for numbers)
        self.max = {}  # per-type upper bound
        self.count = defaultdict(int)  # frequency of each distinct value

    def add(self, value):
        """Fold one cell value into the running summary."""
        t = type(value)
        self.type.add(t)

        bound = get_bound(value)
        self.min[t] = min(self.min[t], bound) if t in self.min else bound
        self.max[t] = max(self.max[t], bound) if t in self.max else bound

        # Guard kept: counting can apparently be disabled by setting
        # self.count = None externally — TODO confirm against callers.
        if self.count is not None:
            self.count[value] += 1

    def single(self, t: Optional[Type]) -> Optional[dict]:
        """Return the JSON-Schema fragment for one python type, or None
        for unsupported types (e.g. NoneType, which marks nullability)."""
        if t == int:
            return {"type": "integer", "minimum": self.min[t], "maximum": self.max[t]}
        if t == float:
            return {"type": "number", "minimum": self.min[t], "maximum": self.max[t]}
        if t == bool:
            return {"type": "boolean"}
        if t == str:
            return {
                "type": "string",
                "minLength": self.min[t],
                "maxLength": self.max[t],
            }
        return None

    def to_dict(self) -> dict:
        """Render the accumulated summary as a JSON-Schema dict."""
        types: list = []
        null = False
        for _type in self.type:
            if t := self.single(_type):
                types.append(t)
            else:
                # Any unsupported type (NoneType in practice) makes the
                # column nullable.
                null = True

        if len(types) == 0 and null:
            return {"type": "null"}

        if len(types) == 1 and not null:
            return types[0]

        if len(types) == 1 and null:
            types[0]["type"] = [types[0]["type"], "null"]
            return types[0]

        # Fix: a nullable multi-type column previously dropped the null
        # alternative entirely; represent it explicitly in the oneOf.
        if null:
            types.append({"type": "null"})
        return {"oneOf": types}


def parse_row(txt):
    """Decode one strict-csv line (comma-separated JSON scalars) into a list."""
    wrapped = f"[{txt}]"
    return json.loads(wrapped)


def get_schema_for_file(f) -> dict[str, Schema]:
    """Build a per-column Schema from an iterable of strict-csv lines.

    The first line is treated as the header naming the columns; every
    subsequent line contributes one value per column.
    """
    columns = parse_row(next(f))
    summaries = {name: Schema() for name in columns}
    for line in f:
        for name, cell in zip(columns, parse_row(line)):
            summaries[name].add(cell)
    return summaries
2 changes: 2 additions & 0 deletions csv_bleach/json_encode.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def parse_line(text: bytes, delimiter: bytes, expected_count: int) -> bytes:
    """Convert one raw csv line into a strict-csv line.

    Each of the *expected_count* fields is JSON-encoded and the results
    are joined with ", ". Raises ValueError when the number of parsed
    fields differs from *expected_count* (implemented in json_encode.pyx).
    """
65 changes: 65 additions & 0 deletions csv_bleach/json_encode.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# my_module.pyx

# Literal tokens that map directly to a JSON primitive (matched
# case-insensitively); empty and "n/a" cells become JSON null.
SPECIAL = {b"true": b"true", b"false": b"false", b"null": b"null", b"": b"null", b"n/a": b"null"}

def json_encode_primitive(bytes txt) -> bytes:
    # Encode one raw csv field as a single JSON scalar token (bytes).
    # Whitespace is trimmed and bare double quotes are backslash-escaped.
    clean_text = txt.strip().replace(b'"', br'\"')
    if not clean_text:
        return b"null"

    try:
        # Case-insensitive lookup of boolean/null literals.
        return SPECIAL[clean_text.lower()]
    except KeyError:
        pass

    # NOTE(review): fields starting with "0" are never parsed as numbers —
    # presumably to keep zero-padded identifiers like "007" as strings,
    # but this also leaves "0" and "0.5" quoted; confirm this is intended.
    if clean_text[0] != b"0"[0]:
        try:
            # If it parses as a float, emit the original text verbatim
            # (valid JSON number syntax is a subset of what float() accepts
            # — TODO confirm inputs like "inf"/"1_0" cannot reach here).
            float(clean_text)
            return clean_text
        except ValueError:
            pass

    # Anything else is a string: wrap in double quotes.
    return b'"' + clean_text + b'"'


def parse_line(text: bytes, delimiter: bytes, expected_count: int) -> bytes:
    # Convert one raw csv line into a strict-csv line: exactly
    # expected_count JSON-encoded fields joined by ", ".
    # Pre-normalise: drop the trailing newline and rewrite the csv
    # doubled-quote escape ("") as a backslash escape (\").
    text = text.rstrip(b"\n").replace(b'""', b'\\"')

    if not text:
        return b""

    # Pre-sized output slots; i tracks which field is being accumulated.
    fields: list[bytes] = [b"" for _ in range(expected_count)]
    current_field: bytes = b""
    # is_quoted: currently inside an (unescaped) double-quoted region.
    is_quoted: bool = False
    # is_escaped: the previous character was an unconsumed backslash.
    is_escaped: bool = False
    i: int = 0

    for c in text:
        # Iterating bytes yields ints; rebuild a 1-byte bytes for comparison.
        char = bytes([c])
        if char == delimiter and not is_quoted:
            if is_escaped:
                # Escaped delimiter: keep it (with its backslash) in the field.
                # NOTE(review): is_escaped is not cleared here, so a quote
                # immediately following an escaped delimiter is treated as
                # escaped too — confirm this is intended.
                current_field += b"\\" + char
            else:
                # Field boundary: encode the finished field, start the next.
                fields[i] = json_encode_primitive(current_field)
                i += 1
                current_field = b""
        elif char == b'"':
            if is_escaped:
                current_field += char
            else:
                # Unescaped quote toggles quoted mode and is not emitted.
                is_quoted = not is_quoted

        elif char == b"\\":
            # A backslash toggles (so "\\" cancels itself out).
            is_escaped = not is_escaped
        else:
            is_escaped = False
            current_field += char

    # Flush the final field (no trailing delimiter on the line).
    fields[i] = json_encode_primitive(current_field)

    if expected_count != i + 1:
        raise ValueError(  # pragma: no cover
            f"expected {expected_count} got: {i+1}, original: `{text}`"
        )

    return b", ".join(fields)
21 changes: 0 additions & 21 deletions csv_bleach/line_decoder.py

This file was deleted.

Loading