Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8', '3.11']
python-version: ['3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v2
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
*.c

# C extensions
*.so
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ lint:
poetry run isort tests csv_bleach
poetry run black tests csv_bleach
poetry run flake8 tests csv_bleach
poetry run mypy tests csv_bleach
poetry run mypy . --ignore-missing-imports

test:
poetry run pytest tests --cov=csv_bleach --cov-report term-missing --cov-fail-under 100
53 changes: 53 additions & 0 deletions build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""https://github.com/mdgoldberg/poetry-cython-example

needs python3-dev
"""

import os
import shutil
from distutils.command.build_ext import build_ext
from distutils.core import Distribution, Extension

from Cython.Build import cythonize

compile_args = ["-march=native", "-O3", "-msse", "-msse2", "-mfma", "-mfpmath=sse"]
link_args: list = []
include_dirs: list = []
libraries = ["m"]


def build():
    """Cythonize csv_bleach/json_encode.pyx and copy the built extension
    back into the source tree (run via poetry's build.py hook)."""
    ext = Extension(
        "*",
        ["csv_bleach/json_encode.pyx"],
        extra_compile_args=compile_args,
        extra_link_args=link_args,
        include_dirs=include_dirs,
        libraries=libraries,
    )
    compiled = cythonize(
        [ext],
        include_path=include_dirs,
        compiler_directives={"binding": True, "language_level": 3},
    )

    dist = Distribution({"name": "extended", "ext_modules": compiled})
    dist.package_dir = "extended"

    builder = build_ext(dist)
    builder.ensure_finalized()
    builder.run()

    # Mirror each built artifact from the build dir into the project tree,
    # then add execute permission wherever read permission already exists.
    for built in builder.get_outputs():
        target = os.path.relpath(built, builder.build_lib)
        shutil.copyfile(built, target)
        perms = os.stat(target).st_mode
        os.chmod(target, perms | ((perms & 0o444) >> 2))


if __name__ == "__main__":
    build()
Empty file removed csv_bleach/__init__.py
Empty file.
24 changes: 17 additions & 7 deletions csv_bleach/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
from typing import Optional

import click
from charset_normalizer import from_bytes

from csv_bleach.detect_row_count import detect_row_count
from csv_bleach.type_casting import infer_types
from csv_bleach.detect_delimiter import infer_delimiter
from csv_bleach.json_encode import parse_line

logging.basicConfig(level=logging.INFO)

__all__ = ["cli"]


@click.command()
@click.argument("file", type=click.Path(exists=True))
Expand All @@ -23,13 +26,20 @@ def cli(file: str, output: Optional[str]):
output = f"{filepath}.scsv"

with open(file, "rb") as input_file:
row_count = detect_row_count(input_file)
row_count = sum(1 for _ in iter(input_file))

with open(file, "rb") as input_file:
type_caster = infer_types(input_file)

with open(file, "rb") as input_file, open(output, "w") as output_file:
type_caster.process_file(input_file, output_file, row_count)
delimiter, column_count = infer_delimiter(input_file)

with open(file, "rb") as input_file, open(output, "wb") as output_file:
with click.progressbar(input_file, length=row_count) as rows:
for i, row in enumerate(rows):
try:
utf8_str = row.decode()
except UnicodeError:
utf8_str = str(from_bytes(row).best())
json_str = parse_line(utf8_str, delimiter, column_count)
output_file.write(json_str + "\n")


if __name__ == "__main__":
Expand Down
71 changes: 38 additions & 33 deletions csv_bleach/detect_delimiter.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
from __future__ import annotations

import collections
import logging
from typing import Iterator
from typing import BinaryIO, Iterator

from charset_normalizer import from_bytes

__all__ = ["infer_delimiter"]

QUOTE = 34
NEW_LINE = 10
CARRIAGE_RETURN = 13


class DelimiterDetector:
def __init__(self, delimiter_count: dict[str, int]):
self.delimiter_count = delimiter_count

@classmethod
def parse_row(cls, txt: str) -> DelimiterDetector:
def parse_row(cls, byte_txt: bytes) -> DelimiterDetector:
txt = str(from_bytes(byte_txt).best())
escaped = False
chars = []
prev = None
Expand All @@ -39,34 +44,34 @@ def parse_row(cls, txt: str) -> DelimiterDetector:
def __eq__(self, other):
return self.delimiter_count == other.delimiter_count

@classmethod
def combine(cls, rows: Iterator[DelimiterDetector]) -> DelimiterDetector:
def _combine(
left: DelimiterDetector, right: DelimiterDetector
) -> DelimiterDetector:
intersection = {
key: value
for key, value in left.delimiter_count.items()
if key in right.delimiter_count
and left.delimiter_count[key] == right.delimiter_count[key]
}
return cls(intersection)

def log(row_number):
k, *_ = list(current.delimiter_count.keys())
logging.info(
f"`{k}` has been identified as the delimiter after {row_number+1} rows"
)

current = next(rows)
for i, row in enumerate(rows):
current = _combine(current, row)
if len(current.delimiter_count) == 1:
log(i)

def combine(rows: Iterator[DelimiterDetector]) -> DelimiterDetector:
current = next(rows)
for row in rows:
intersection = {
key: value
for key, value in current.delimiter_count.items()
if key in row.delimiter_count
and current.delimiter_count[key] == row.delimiter_count[key]
}
current = DelimiterDetector(intersection)

if len(current.delimiter_count) == 1:
return current
if len(current.delimiter_count) == 2:
if " " in current.delimiter_count:
current.delimiter_count.pop(" ")
return current
if len(current.delimiter_count) == 2:
if " " in current.delimiter_count:
current.delimiter_count.pop(" ")
log(i)
return current
raise ValueError("no delimiter detected in file")
raise ValueError("no delimiter detected in file")


def infer_delimiter(rows: BinaryIO) -> tuple[str, int]:
    """Infer how a csv is delimited.

    Returns the delimiter character and the number of fields per row.
    """
    detector = combine(DelimiterDetector.parse_row(row) for row in rows)
    counts = detector.delimiter_count
    # combine() guarantees exactly one surviving candidate on success.
    assert len(counts) == 1, counts
    delimiter, occurrences_per_row = next(iter(counts.items()))
    # N occurrences of the delimiter per row means N + 1 fields.
    return delimiter, occurrences_per_row + 1
5 changes: 0 additions & 5 deletions csv_bleach/detect_row_count.py

This file was deleted.

82 changes: 82 additions & 0 deletions csv_bleach/infer_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import json
from collections import defaultdict
from typing import Type, Optional


def get_bound(value):
    """Return the magnitude used for min/max tracking of a cell value:
    0 for None and booleans, the length for strings, the value itself
    for numbers."""
    if value is None or isinstance(value, bool):
        return 0
    if isinstance(value, str):
        return len(value)
    return value


class Schema:
    """Accumulates a JSON-Schema-style summary for one CSV column:
    the set of python types seen, per-type min/max bounds, and a
    frequency count of distinct values."""

    def __init__(self):
        self.type = set()  # python types observed in this column
        self.min = {}  # per-type lower bound (length for str, value for numbers)
        self.max = {}  # per-type upper bound
        self.count = defaultdict(int)  # frequency of each distinct value

    def add(self, value):
        """Fold one cell value into the running summary."""
        t = type(value)
        self.type.add(t)

        bound = get_bound(value)
        self.min[t] = min(self.min[t], bound) if t in self.min else bound
        self.max[t] = max(self.max[t], bound) if t in self.max else bound

        # Guard kept: counting can apparently be disabled by setting
        # self.count = None externally — TODO confirm against callers.
        if self.count is not None:
            self.count[value] += 1

    def single(self, t: Optional[Type]) -> Optional[dict]:
        """Return the JSON-Schema fragment for one python type, or None
        for unsupported types (e.g. NoneType, which marks nullability)."""
        if t == int:
            return {"type": "integer", "minimum": self.min[t], "maximum": self.max[t]}
        if t == float:
            return {"type": "number", "minimum": self.min[t], "maximum": self.max[t]}
        if t == bool:
            return {"type": "boolean"}
        if t == str:
            return {
                "type": "string",
                "minLength": self.min[t],
                "maxLength": self.max[t],
            }
        return None

    def to_dict(self) -> dict:
        """Render the accumulated summary as a JSON-Schema dict."""
        types: list = []
        null = False
        for _type in self.type:
            if t := self.single(_type):
                types.append(t)
            else:
                # Any unsupported type (NoneType in practice) makes the
                # column nullable.
                null = True

        if len(types) == 0 and null:
            return {"type": "null"}

        if len(types) == 1 and not null:
            return types[0]

        if len(types) == 1 and null:
            types[0]["type"] = [types[0]["type"], "null"]
            return types[0]

        # Fix: a nullable multi-type column previously dropped the null
        # alternative entirely; represent it explicitly in the oneOf.
        if null:
            types.append({"type": "null"})
        return {"oneOf": types}


def parse_row(txt):
    """Decode one strict-csv line (comma-separated JSON scalars) into a list."""
    wrapped = f"[{txt}]"
    return json.loads(wrapped)


def get_schema_for_file(f) -> dict[str, Schema]:
    """Build a per-column Schema from an iterable of strict-csv lines.

    The first line is treated as the header naming the columns; every
    subsequent line contributes one value per column.
    """
    columns = parse_row(next(f))
    summaries = {name: Schema() for name in columns}
    for line in f:
        for name, cell in zip(columns, parse_row(line)):
            summaries[name].add(cell)
    return summaries
2 changes: 2 additions & 0 deletions csv_bleach/json_encode.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def parse_line(text: bytes, delimiter: bytes, expected_count: int) -> bytes:
    """Convert one raw csv line into a strict-csv line.

    Each of the *expected_count* fields is JSON-encoded and the results
    are joined with ", ". Raises ValueError when the number of parsed
    fields differs from *expected_count* (implemented in json_encode.pyx).
    """
65 changes: 65 additions & 0 deletions csv_bleach/json_encode.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# my_module.pyx

# Literal tokens that map directly to a JSON primitive (matched
# case-insensitively); empty and "n/a" cells become JSON null.
SPECIAL = {b"true": b"true", b"false": b"false", b"null": b"null", b"": b"null", b"n/a": b"null"}

def json_encode_primitive(bytes txt) -> bytes:
    # Encode one raw csv field as a single JSON scalar token (bytes).
    # Whitespace is trimmed and bare double quotes are backslash-escaped.
    clean_text = txt.strip().replace(b'"', br'\"')
    if not clean_text:
        return b"null"

    try:
        # Case-insensitive lookup of boolean/null literals.
        return SPECIAL[clean_text.lower()]
    except KeyError:
        pass

    # NOTE(review): fields starting with "0" are never parsed as numbers —
    # presumably to keep zero-padded identifiers like "007" as strings,
    # but this also leaves "0" and "0.5" quoted; confirm this is intended.
    if clean_text[0] != b"0"[0]:
        try:
            # If it parses as a float, emit the original text verbatim
            # (valid JSON number syntax is a subset of what float() accepts
            # — TODO confirm inputs like "inf"/"1_0" cannot reach here).
            float(clean_text)
            return clean_text
        except ValueError:
            pass

    # Anything else is a string: wrap in double quotes.
    return b'"' + clean_text + b'"'


def parse_line(text: bytes, delimiter: bytes, expected_count: int) -> bytes:
    # Convert one raw csv line into a strict-csv line: exactly
    # expected_count JSON-encoded fields joined by ", ".
    # Pre-normalise: drop the trailing newline and rewrite the csv
    # doubled-quote escape ("") as a backslash escape (\").
    text = text.rstrip(b"\n").replace(b'""', b'\\"')

    if not text:
        return b""

    # Pre-sized output slots; i tracks which field is being accumulated.
    fields: list[bytes] = [b"" for _ in range(expected_count)]
    current_field: bytes = b""
    # is_quoted: currently inside an (unescaped) double-quoted region.
    is_quoted: bool = False
    # is_escaped: the previous character was an unconsumed backslash.
    is_escaped: bool = False
    i: int = 0

    for c in text:
        # Iterating bytes yields ints; rebuild a 1-byte bytes for comparison.
        char = bytes([c])
        if char == delimiter and not is_quoted:
            if is_escaped:
                # Escaped delimiter: keep it (with its backslash) in the field.
                # NOTE(review): is_escaped is not cleared here, so a quote
                # immediately following an escaped delimiter is treated as
                # escaped too — confirm this is intended.
                current_field += b"\\" + char
            else:
                # Field boundary: encode the finished field, start the next.
                fields[i] = json_encode_primitive(current_field)
                i += 1
                current_field = b""
        elif char == b'"':
            if is_escaped:
                current_field += char
            else:
                # Unescaped quote toggles quoted mode and is not emitted.
                is_quoted = not is_quoted

        elif char == b"\\":
            # A backslash toggles (so "\\" cancels itself out).
            is_escaped = not is_escaped
        else:
            is_escaped = False
            current_field += char

    # Flush the final field (no trailing delimiter on the line).
    fields[i] = json_encode_primitive(current_field)

    if expected_count != i + 1:
        raise ValueError(  # pragma: no cover
            f"expected {expected_count} got: {i+1}, original: `{text}`"
        )

    return b", ".join(fields)
21 changes: 0 additions & 21 deletions csv_bleach/line_decoder.py

This file was deleted.

Loading