From 1bc40c254bbb5ccf85014b537d6f2433fee2958f Mon Sep 17 00:00:00 2001
From: root <root@ip-172-31-21-78-3e4a2d1a.c.test-import-214822.internal>
Date: Sat, 12 Jan 2019 17:29:33 +0000
Subject: [PATCH 1/2] Add to the vcf_to_dataframe methods the capability to
 return a dataframe with also the 'calldata' group info

---
 allel/io/vcf_read.py | 46 +++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/allel/io/vcf_read.py b/allel/io/vcf_read.py
index 1aaa72b4..997085ec 100644
--- a/allel/io/vcf_read.py
+++ b/allel/io/vcf_read.py
@@ -50,7 +50,6 @@
 
 def _prep_fields_param(fields):
     """Prepare the `fields` parameter, and determine whether or not to store samples."""
-
     store_samples = False
 
     if fields is None:
@@ -67,7 +66,6 @@ def _prep_fields_param(fields):
         store_samples = True
     elif '*' in fields:
         store_samples = True
-
     return store_samples, fields
 
 
@@ -235,6 +233,11 @@ def _chunk_iter_rename(it, rename_fields):
 _doc_param_log = \
     """A file-like object (e.g., `sys.stderr`) to print progress information."""
 
+_doc_param_calldata = \
+    """Return in the dataframe also the information in the 'calldata' group.
+       It should be used carefully with large population vcf and combained
+       with samples parameter for reduce memory usage of the returned DF"""
+
 
 # noinspection PyShadowingBuiltins
 def read_vcf(input,
@@ -301,7 +304,6 @@ def read_vcf(input,
     # samples requested?
     # noinspection PyTypeChecker
     store_samples, fields = _prep_fields_param(fields)
-
     # setup
     fields, samples, headers, it = iter_vcf_chunks(
         input=input, fields=fields, exclude_fields=exclude_fields, types=types,
@@ -309,7 +311,6 @@ def read_vcf(input,
         chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
         samples=samples, transformers=transformers
     )
-
     # handle field renaming
     if rename_fields:
         rename_fields, it = _do_rename(it, fields=fields,
@@ -1133,7 +1134,6 @@ def iter_vcf_chunks(input,
         Chunk iterator.
 
     """
-
     # setup commmon keyword args
     kwds = dict(fields=fields, exclude_fields=exclude_fields, types=types,
                 numbers=numbers, alt_number=alt_number, chunk_length=chunk_length,
@@ -1145,7 +1145,6 @@ def iter_vcf_chunks(input,
 
     # setup iterator
     fields, samples, headers, it = _iter_vcf_stream(stream, **kwds)
-
     # setup transformers
     if transformers is not None:
         # API flexibility
@@ -1774,13 +1773,15 @@ def _read_vcf_headers(stream):
     return VCFHeaders(headers, filters, infos, formats, samples)
 
 
-def _chunk_to_dataframe(fields, chunk):
+def _chunk_to_dataframe(fields, chunk, samples=[]):
     import pandas
+    import sys
     items = list()
     for f in fields:
         a = chunk[f]
         group, name = f.split('/')
-        assert group == 'variants'
+        if samples == []:
+            assert group == 'variants'
         if a.dtype.kind == 'S':
             # always convert strings for pandas - if U then pandas will use object dtype
             a = a.astype('U')
@@ -1789,6 +1790,11 @@ def _chunk_to_dataframe(fields, chunk):
         elif a.ndim == 2:
             for i in range(a.shape[1]):
                 items.append(('%s_%s' % (name, i + 1), a[:, i]))
+        elif a.ndim == 3:
+            assert group == 'calldata'
+            for sample in range(a.shape[1]):
+                for i in range(a.shape[2]):
+                    items.append(('%s_%s_%s' % (name, samples[sample], i + 1), a[:, sample,  i]))
         else:
             warnings.warn('cannot handle array %r with >2 dimensions, skipping' % name)
     df = pandas.DataFrame.from_dict(OrderedDict(items))
@@ -1807,6 +1813,8 @@ def vcf_to_dataframe(input,
                      fills=None,
                      region=None,
                      tabix='tabix',
+                     calldata=False,
+                     samples=None,
                      transformers=None,
                      buffer_size=DEFAULT_BUFFER_SIZE,
                      chunk_length=DEFAULT_CHUNK_LENGTH,
@@ -1833,6 +1841,10 @@ def vcf_to_dataframe(input,
         {region}
     tabix : string, optional
         {tabix}
+    calldata : bool, optional
+        {calldata}
+    samples : list of string, optional
+        {samples}
     transformers : list of transformer objects, optional
         {transformers}
     buffer_size : int, optional
@@ -1841,7 +1853,6 @@ def vcf_to_dataframe(input,
         {chunk_length}
     log : file-like, optional
         {log}
-
     Returns
     -------
     df : pandas.DataFrame
@@ -1852,16 +1863,19 @@ def vcf_to_dataframe(input,
 
     # samples requested?
     # noinspection PyTypeChecker
-    _, fields = _prep_fields_param(fields)
-
+    if calldata:
+        fields = '*'
+        store_sample, fields = _prep_fields_param(fields)
+    else:
+        _, fields = _prep_fields_param(fields)
+        samples = []
     # setup
-    fields, _, _, it = iter_vcf_chunks(
+    fields, samples, _, it = iter_vcf_chunks(
         input=input, fields=fields, exclude_fields=exclude_fields, types=types,
         numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
-        chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=[],
+        chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=samples,
         transformers=transformers
     )
-
     # setup progress logging
     if log is not None:
         it = _chunk_iter_progress(it, log, prefix='[vcf_to_dataframe]')
@@ -1875,7 +1889,7 @@ def vcf_to_dataframe(input,
     if chunks:
 
         # concatenate chunks
-        output = pandas.concat([_chunk_to_dataframe(fields, chunk)
+        output = pandas.concat([_chunk_to_dataframe(fields, chunk, samples)
                                 for chunk in chunks])
 
     return output
@@ -1895,6 +1909,8 @@ def vcf_to_dataframe(input,
     buffer_size=_doc_param_buffer_size,
     chunk_length=_doc_param_chunk_length,
     log=_doc_param_log,
+    calldata=_doc_param_calldata,
+    samples=_doc_param_samples
 )
 
 

From ee2362c6bd4c3e39d2bd5e7ed890a9e3116d5367 Mon Sep 17 00:00:00 2001
From: root <root@ip-172-31-21-78-3e4a2d1a.c.test-import-214822.internal>
Date: Sat, 12 Jan 2019 17:58:06 +0000
Subject: [PATCH 2/2] Removed the unused sys package

---
 allel/io/vcf_read.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/allel/io/vcf_read.py b/allel/io/vcf_read.py
index 997085ec..9dc149dd 100644
--- a/allel/io/vcf_read.py
+++ b/allel/io/vcf_read.py
@@ -1775,7 +1775,6 @@ def _read_vcf_headers(stream):
 
 def _chunk_to_dataframe(fields, chunk, samples=[]):
     import pandas
-    import sys
     items = list()
     for f in fields:
         a = chunk[f]