From 1bc40c254bbb5ccf85014b537d6f2433fee2958f Mon Sep 17 00:00:00 2001 From: root Date: Sat, 12 Jan 2019 17:29:33 +0000 Subject: [PATCH 1/2] Add to the vcf_to_dataframe method the capability to also return the 'calldata' group info in the dataframe --- allel/io/vcf_read.py | 46 +++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/allel/io/vcf_read.py b/allel/io/vcf_read.py index 1aaa72b4..997085ec 100644 --- a/allel/io/vcf_read.py +++ b/allel/io/vcf_read.py @@ -50,7 +50,6 @@ def _prep_fields_param(fields): """Prepare the `fields` parameter, and determine whether or not to store samples.""" - store_samples = False if fields is None: @@ -67,7 +66,6 @@ def _prep_fields_param(fields): store_samples = True elif '*' in fields: store_samples = True - return store_samples, fields @@ -235,6 +233,11 @@ def _chunk_iter_rename(it, rename_fields): _doc_param_log = \ """A file-like object (e.g., `sys.stderr`) to print progress information.""" +_doc_param_calldata = \ + """Return in the dataframe also the information in the 'calldata' group. + It should be used carefully with large population vcf and combined + with samples parameter to reduce memory usage of the returned DF""" + # noinspection PyShadowingBuiltins def read_vcf(input, @@ -301,7 +304,6 @@ def read_vcf(input, # samples requested? # noinspection PyTypeChecker store_samples, fields = _prep_fields_param(fields) - # setup fields, samples, headers, it = iter_vcf_chunks( input=input, fields=fields, exclude_fields=exclude_fields, types=types, @@ -309,7 +311,6 @@ def read_vcf(input, chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=samples, transformers=transformers ) - # handle field renaming if rename_fields: rename_fields, it = _do_rename(it, fields=fields, @@ -1133,7 +1134,6 @@ def iter_vcf_chunks(input, Chunk iterator. 
""" - # setup commmon keyword args kwds = dict(fields=fields, exclude_fields=exclude_fields, types=types, numbers=numbers, alt_number=alt_number, chunk_length=chunk_length, @@ -1145,7 +1145,6 @@ def iter_vcf_chunks(input, # setup iterator fields, samples, headers, it = _iter_vcf_stream(stream, **kwds) - # setup transformers if transformers is not None: # API flexibility @@ -1774,13 +1773,15 @@ def _read_vcf_headers(stream): return VCFHeaders(headers, filters, infos, formats, samples) -def _chunk_to_dataframe(fields, chunk): +def _chunk_to_dataframe(fields, chunk, samples=[]): import pandas + import sys items = list() for f in fields: a = chunk[f] group, name = f.split('/') - assert group == 'variants' + if samples == []: + assert group == 'variants' if a.dtype.kind == 'S': # always convert strings for pandas - if U then pandas will use object dtype a = a.astype('U') @@ -1789,6 +1790,11 @@ def _chunk_to_dataframe(fields, chunk): elif a.ndim == 2: for i in range(a.shape[1]): items.append(('%s_%s' % (name, i + 1), a[:, i])) + elif a.ndim == 3: + assert group == 'calldata' + for sample in range(a.shape[1]): + for i in range(a.shape[2]): + items.append(('%s_%s_%s' % (name, samples[sample], i + 1), a[:, sample, i])) else: warnings.warn('cannot handle array %r with >2 dimensions, skipping' % name) df = pandas.DataFrame.from_dict(OrderedDict(items)) @@ -1807,6 +1813,8 @@ def vcf_to_dataframe(input, fills=None, region=None, tabix='tabix', + calldata=False, + samples=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, @@ -1833,6 +1841,10 @@ def vcf_to_dataframe(input, {region} tabix : string, optional {tabix} + calldata : bool, optional + {calldata} + samples : list of string, optional + {samples} transformers : list of transformer objects, optional {transformers} buffer_size : int, optional @@ -1841,7 +1853,6 @@ def vcf_to_dataframe(input, {chunk_length} log : file-like, optional {log} - Returns ------- df : pandas.DataFrame @@ 
-1852,16 +1863,19 @@ def vcf_to_dataframe(input, # samples requested? # noinspection PyTypeChecker - _, fields = _prep_fields_param(fields) - + if calldata: + fields = '*' + store_sample, fields = _prep_fields_param(fields) + else: + _, fields = _prep_fields_param(fields) + samples = [] # setup - fields, _, _, it = iter_vcf_chunks( + fields, samples, _, it = iter_vcf_chunks( input=input, fields=fields, exclude_fields=exclude_fields, types=types, numbers=numbers, alt_number=alt_number, buffer_size=buffer_size, - chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=[], + chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=samples, transformers=transformers ) - # setup progress logging if log is not None: it = _chunk_iter_progress(it, log, prefix='[vcf_to_dataframe]') @@ -1875,7 +1889,7 @@ def vcf_to_dataframe(input, if chunks: # concatenate chunks - output = pandas.concat([_chunk_to_dataframe(fields, chunk) + output = pandas.concat([_chunk_to_dataframe(fields, chunk, samples) for chunk in chunks]) return output @@ -1895,6 +1909,8 @@ def vcf_to_dataframe(input, buffer_size=_doc_param_buffer_size, chunk_length=_doc_param_chunk_length, log=_doc_param_log, + calldata=_doc_param_calldata, + samples=_doc_param_samples ) From ee2362c6bd4c3e39d2bd5e7ed890a9e3116d5367 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 12 Jan 2019 17:58:06 +0000 Subject: [PATCH 2/2] Removed the unused sys package --- allel/io/vcf_read.py | 1 - 1 file changed, 1 deletion(-) diff --git a/allel/io/vcf_read.py b/allel/io/vcf_read.py index 997085ec..9dc149dd 100644 --- a/allel/io/vcf_read.py +++ b/allel/io/vcf_read.py @@ -1775,7 +1775,6 @@ def _read_vcf_headers(stream): def _chunk_to_dataframe(fields, chunk, samples=[]): import pandas - import sys items = list() for f in fields: a = chunk[f]