diff --git a/.gitignore b/.gitignore index 36d4101..0a086e8 100644 --- a/.gitignore +++ b/.gitignore @@ -111,3 +111,6 @@ test/ # Praat textgrids *.*TextGrid + +# PyCharm +.idea diff --git a/textgrids/__init__.py b/textgrids/__init__.py index 036adf5..532f3cf 100755 --- a/textgrids/__init__.py +++ b/textgrids/__init__.py @@ -130,7 +130,7 @@ def __add__(self, tier): raise TypeError('tier types differ') # Do not add a tier at the end which begins before this one ends. if self.xmax > tier.xmin: - raise ValueError('Cannot extend a tier with one that begins before this tier ends: {max} > {min}', + raise ValueError('Cannot extend a tier with one that begins before this tier ends: {max} > {min}', self.xmax, tier.xmin) return Tier(super().__add__(tier)) @@ -186,9 +186,12 @@ def tier_type(self): class TextGrid(OrderedDict): '''TextGrid is a dict of tier names (keys) and Tiers (values).''' - def __init__(self, filename=None, xmin=0.0): + def __init__(self, filename=None, xmin=0.0, coding=None): self.xmin = self.xmax = xmin self.filename = filename + self.coding = coding + if self.coding is None: + self.coding = "utf-8" if self.filename: self.read(self.filename) @@ -346,7 +349,7 @@ def offset_time(self, offset): for tier in tiers: self[tier].offset_time(offset) - def parse(self, data): + def parse(self, data, coding = None): '''Parse textgrid data. Obligatory argument "data" is bytes. @@ -354,7 +357,10 @@ def parse(self, data): if not isinstance(data, bytes): raise TypeError binary = b'ooBinaryFile\x08TextGrid' - text = ['File type = "ooTextFile"', 'Object class = "TextGrid"', ''] + headers = [ + ['File type = "ooTextFile"', 'Object class = "TextGrid"', ''], + ['File type = "ooTextFile short"', '"TextGrid"', ''], + ] # Check and then discard binary header if data[:len(binary)] == binary: buff = io.BytesIO(data[len(binary):]) @@ -363,22 +369,23 @@ def parse(self, data): except (IndexError, ValueError): raise BinaryError else: - coding = 'utf-8' + coding = self.coding # Note and then discard BOM if data[:2] == b'\xfe\xff': coding = 'utf-16-be' data = data[2:] # Now convert to a text buffer buff = [s.strip() for s in data.decode(coding).split('\n')] - # Check and then discard header - if buff[:len(text)] != text: - raise TypeError - buff = buff[len(text):] - # If the next line starts with a number, this is a short textgrid - if buff[0][0] in '-0123456789': - self._parse_short(buff) - else: - self._parse_long(buff) + for header in headers: + # Check and then discard header + if buff[:len(header)] != header: + continue + buff = buff[len(header):] + # If the next line starts with a number, this is a short textgrid + if buff[0][0] in '-0123456789': + self._parse_short(buff) + else: + self._parse_long(buff) def _parse_binary(self, data): '''Parse BINARY textgrid files. Not intended to be used directly.''' @@ -496,8 +503,16 @@ def read(self, filename): "filename" is the name of the file. ''' self.filename = filename - with open(self.filename, 'rb') as infile: - data = infile.read() + data = None + if isinstance(self.filename, str): + with open(self.filename, 'rb') as infile: + data = infile.read() + if isinstance(self.filename, bytes): + data = self.filename + if isinstance(self.filename, io.BytesIO): + data = self.filename.read() + if data is None: + raise TypeError("Filename must be any of str, bytes or ByteIO") self.parse(data) def tier_from_csv(self, tier_name, filename):