From 85a37d78370c77be08016310ca03001297859bc2 Mon Sep 17 00:00:00 2001 From: thomaskluiters Date: Thu, 24 Aug 2023 10:12:15 +0200 Subject: [PATCH 1/6] Add PyCharm files to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 36d4101..0a086e8 100644 --- a/.gitignore +++ b/.gitignore @@ -111,3 +111,6 @@ test/ # Praat textgrids *.*TextGrid + +# PyCharm +.idea From 4ee2cd5a968240f0dc3a7cd98e2712bd1ec69fce Mon Sep 17 00:00:00 2001 From: thomaskluiters Date: Thu, 24 Aug 2023 10:16:11 +0200 Subject: [PATCH 2/6] Make encoding configurable and allow filename to be bytes --- textgrids/__init__.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/textgrids/__init__.py b/textgrids/__init__.py index 036adf5..9e70564 100755 --- a/textgrids/__init__.py +++ b/textgrids/__init__.py @@ -130,7 +130,7 @@ def __add__(self, tier): raise TypeError('tier types differ') # Do not add a tier at the end which begins before this one ends. if self.xmax > tier.xmin: - raise ValueError('Cannot extend a tier with one that begins before this tier ends: {max} > {min}', + raise ValueError('Cannot extend a tier with one that begins before this tier ends: {max} > {min}', self.xmax, tier.xmin) return Tier(super().__add__(tier)) @@ -186,9 +186,12 @@ def tier_type(self): class TextGrid(OrderedDict): '''TextGrid is a dict of tier names (keys) and Tiers (values).''' - def __init__(self, filename=None, xmin=0.0): + def __init__(self, filename=None, xmin=0.0, coding=None): self.xmin = self.xmax = xmin self.filename = filename + self.coding = coding + if self.coding is None: + self.coding = "utf-8" if self.filename: self.read(self.filename) @@ -346,7 +349,7 @@ def offset_time(self, offset): for tier in tiers: self[tier].offset_time(offset) - def parse(self, data): + def parse(self, data, coding = None): '''Parse textgrid data. Obligatory argument "data" is bytes. @@ -363,7 +366,7 @@ def parse(self, data): except (IndexError, ValueError): raise BinaryError else: - coding = 'utf-8' + coding = self.coding # Note and then discard BOM if data[:2] == b'\xfe\xff': coding = 'utf-16-be' @@ -496,8 +499,16 @@ def read(self, filename): "filename" is the name of the file. ''' self.filename = filename - with open(self.filename, 'rb') as infile: - data = infile.read() + data = None + if isinstance(self.filename, str): + with open(self.filename, 'rb') as infile: + data = infile.read() + if isinstance(self.filename, bytes): + data = self.filename + if isinstance(self.filename, io.BytesIO): + data = self.filename.read() + if data is None: + raise TypeError("Filename must be any of str, bytes or ByteIO") self.parse(data) def tier_from_csv(self, tier_name, filename): From fef3f463acd928742bdd09f792e7b4f692805211 Mon Sep 17 00:00:00 2001 From: thomaskluiters Date: Thu, 24 Aug 2023 10:45:19 +0200 Subject: [PATCH 3/6] Iterate over headers when parsing --- textgrids/__init__.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/textgrids/__init__.py b/textgrids/__init__.py index 9e70564..e1222c9 100755 --- a/textgrids/__init__.py +++ b/textgrids/__init__.py @@ -373,15 +373,19 @@ def parse(self, data, coding = None): data = data[2:] # Now convert to a text buffer buff = [s.strip() for s in data.decode(coding).split('\n')] - # Check and then discard header - if buff[:len(text)] != text: - raise TypeError - buff = buff[len(text):] - # If the next line starts with a number, this is a short textgrid - if buff[0][0] in '-0123456789': - self._parse_short(buff) + for header in text: + # Check and then discard header + if buff[:len(header)] != header: + continue + buff = buff[len(header):] + # If the next line starts with a number, this is a short textgrid + if buff[0][0] in '-0123456789': + self._parse_short(buff) + else: + self._parse_long(buff) + break else: - self._parse_long(buff) + raise TypeError("No valid header seen in text") def _parse_binary(self, data): '''Parse BINARY textgrid files. Not intended to be used directly.''' From 2cab4c7273695f2480c9160a7190b7e889a07385 Mon Sep 17 00:00:00 2001 From: thomaskluiters Date: Thu, 24 Aug 2023 10:48:20 +0200 Subject: [PATCH 4/6] Revert "Iterate over headers when parsing" This reverts commit fef3f463acd928742bdd09f792e7b4f692805211. --- textgrids/__init__.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/textgrids/__init__.py b/textgrids/__init__.py index e1222c9..9e70564 100755 --- a/textgrids/__init__.py +++ b/textgrids/__init__.py @@ -373,19 +373,15 @@ def parse(self, data, coding = None): data = data[2:] # Now convert to a text buffer buff = [s.strip() for s in data.decode(coding).split('\n')] - for header in text: - # Check and then discard header - if buff[:len(header)] != header: - continue - buff = buff[len(header):] - # If the next line starts with a number, this is a short textgrid - if buff[0][0] in '-0123456789': - self._parse_short(buff) - else: - self._parse_long(buff) - break + # Check and then discard header + if buff[:len(text)] != text: + raise TypeError + buff = buff[len(text):] + # If the next line starts with a number, this is a short textgrid + if buff[0][0] in '-0123456789': + self._parse_short(buff) else: - raise TypeError("No valid header seen in text") + self._parse_long(buff) def _parse_binary(self, data): '''Parse BINARY textgrid files. Not intended to be used directly.''' From 14af41db05014d76f60af7798889e7de735e2f3f Mon Sep 17 00:00:00 2001 From: thomaskluiters Date: Thu, 24 Aug 2023 10:52:37 +0200 Subject: [PATCH 5/6] Add list of header values encountered in the 'wild' --- textgrids/__init__.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/textgrids/__init__.py b/textgrids/__init__.py index 9e70564..d33558a 100755 --- a/textgrids/__init__.py +++ b/textgrids/__init__.py @@ -357,7 +357,10 @@ def parse(self, data, coding = None): if not isinstance(data, bytes): raise TypeError binary = b'ooBinaryFile\x08TextGrid' - text = ['File type = "ooTextFile"', 'Object class = "TextGrid"', ''] + headers = [ + ['File type = "ooTextFile"', 'Object class = "TextGrid"', ''], + ['File type = "ooTextFile short"', '"TextGrid"', ''], + ] # Check and then discard binary header if data[:len(binary)] == binary: buff = io.BytesIO(data[len(binary):]) @@ -373,15 +376,16 @@ def parse(self, data, coding = None): data = data[2:] # Now convert to a text buffer buff = [s.strip() for s in data.decode(coding).split('\n')] - # Check and then discard header - if buff[:len(text)] != text: - raise TypeError - buff = buff[len(text):] - # If the next line starts with a number, this is a short textgrid - if buff[0][0] in '-0123456789': - self._parse_short(buff) - else: - self._parse_long(buff) + for header in headers: + # Check and then discard header + if buff[:len(header)] != header: + raise TypeError + buff = buff[len(header):] + # If the next line starts with a number, this is a short textgrid + if buff[0][0] in '-0123456789': + self._parse_short(buff) + else: + self._parse_long(buff) def _parse_binary(self, data): '''Parse BINARY textgrid files. Not intended to be used directly.''' From 76e1e911f1a4d4baafd8547bc9d776fd71baf42e Mon Sep 17 00:00:00 2001 From: thomaskluiters Date: Wed, 6 Sep 2023 13:17:36 +0200 Subject: [PATCH 6/6] Don't throw a type error if no match could be found --- textgrids/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textgrids/__init__.py b/textgrids/__init__.py index d33558a..532f3cf 100755 --- a/textgrids/__init__.py +++ b/textgrids/__init__.py @@ -379,7 +379,7 @@ def parse(self, data, coding = None): for header in headers: # Check and then discard header if buff[:len(header)] != header: - raise TypeError + continue buff = buff[len(header):] # If the next line starts with a number, this is a short textgrid if buff[0][0] in '-0123456789':