Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
a306954
Added project
avapapetti Feb 28, 2020
d458e3b
first draft commit
avapapetti Apr 23, 2020
3ba9f42
second draft commit
avapapetti Apr 23, 2020
a61cabc
reupload of first draft
avapapetti Apr 30, 2020
5e12d17
updated project draft
avapapetti Apr 30, 2020
13ba497
Project update
avapapetti May 5, 2020
ddb3a00
Test draft1
avapapetti May 5, 2020
2566b37
add setup
avapapetti May 6, 2020
3087965
updated README
avapapetti May 6, 2020
9ed3c89
Delete Project_withmethods.ipynb
avapapetti May 6, 2020
0cb550a
Delete sample_file.py
avapapetti May 6, 2020
14138b6
Delete Project2_Update.ipynb
avapapetti May 6, 2020
7cd3fff
add setup.py and rename directory containing packages code
leej3 May 7, 2020
5bd0f83
move code to module
leej3 May 7, 2020
2bc3d57
move test code to module
leej3 May 7, 2020
b0fc239
add init file so that installation works
leej3 May 7, 2020
a925220
make into functions
leej3 May 7, 2020
0001b0d
add an example pattern for a test
leej3 May 7, 2020
9c18ed0
Merge pull request #1 from leej3/turn_into_package_with_functions
avapapetti May 7, 2020
8eedf58
add CNV Files
avapapetti May 7, 2020
254bc77
add test data
avapapetti May 7, 2020
5007f04
update test file
avapapetti May 7, 2020
be38560
add test for read_cnv function
avapapetti May 8, 2020
7684cd8
add year and author to license
avapapetti May 8, 2020
0268076
updated data directory
avapapetti May 8, 2020
3ec3f5b
add overlap argument
avapapetti May 8, 2020
bca14cb
update common_cnv_finder
avapapetti May 8, 2020
018a784
updat .egg
avapapetti May 8, 2020
f3d78d8
update data directory
avapapetti May 8, 2020
1849dc7
removed sample_test
avapapetti May 8, 2020
ca9b782
reorganize tests
avapapetti May 8, 2020
12cbe06
reorganize fucntion
avapapetti May 8, 2020
d2cf98e
update stup description
avapapetti May 8, 2020
b89551b
Delete Project_withforloop.ipynb
avapapetti May 8, 2020
d5a3412
update README
avapapetti May 8, 2020
5dbb8e6
Update README.md
avapapetti May 8, 2020
ff81037
commit test_output
avapapetti May 8, 2020
d7fed1c
Delete test_output.csv
avapapetti May 8, 2020
8af2058
remove file
avapapetti May 25, 2020
c6630b5
remove .egg
avapapetti May 25, 2020
f9cbdda
update gitignore
avapapetti May 25, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright 2020 Ava Papetti

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# project_spring_2020

[![CircleCI](https://circleci.com/gh/biof309/project_spring_2020/tree/master.svg?style=shield)](https://circleci.com/gh/biof309/project_spring_2020/tree/master)

Common CNV Finder is a Python tool that identifies copy number variant (CNV) regions shared by two different genomic datasets. It accepts each dataset as a tab-delimited text file (the bundled test data use a `.csv` extension) with the columns ['Chrom', 'Start', 'Stop', 'Type', 'P_Value'].

Common CNV Finder first removes CNVs that fail any of the following filters:
1. CNVs shorter than a user-defined minimum CNV length
2. CNVs whose P-value is below a user-defined P-value threshold
3. CNV calls located on 'chrM', which are treated as inaccurate.

It then generates an output file containing the CNVs common to both samples based on additional criteria:
1. CNV located on same chromosome
2. CNV is of same type (Deletion or Duplication)
3. CNV start sites are within a user-defined distance of each other (e.g. 25 bp, the default)
4. CNV stop sites are within a user-defined distance of each other.

The output contains the same columns as the input plus a 'Sample#' column identifying which sample each CNV came from, and is written to a CSV file.
Empty file added common_cnv_finder/__init__.py
Empty file.
60 changes: 60 additions & 0 deletions common_cnv_finder/core_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pandas as pd
import numpy as np

def read_cnv_file(fpath, sample_num):
    """
    Load a tab-delimited CNV file and tag every row with its sample number.

    The file is parsed with pandas and a "Sample#" column holding
    ``sample_num`` is added to the dataframe that is returned.
    """
    cnv_table = pd.read_csv(fpath, sep="\t")
    cnv_table["Sample#"] = sample_num
    return cnv_table


def filter_by_cnv(df, min_cnv_length, p_value_threshold):
    """
    Remove CNVs that fail to meet the initial filtering criteria.

    A row is kept only when all of the following hold:
      * its Chrom is not 'chrM'
      * its length, |Start - Stop|, is at least ``min_cnv_length``
      * its P_Value is at least ``p_value_threshold``

    Parameters
    ----------
    df : pandas.DataFrame
        CNV table with 'Chrom', 'Start', 'Stop' and 'P_Value' columns.
    min_cnv_length : number
        Smallest CNV length that is kept.
    p_value_threshold : number
        Smallest P_Value that is kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels. The input
        dataframe is not modified.
    """
    # A single boolean mask selects rows by position. Dropping by index label
    # would also discard unrelated rows whenever the index is not unique
    # (e.g. after concatenating several tables).
    keep = (
        (df.Chrom != 'chrM')
        & (np.abs(df.Start - df.Stop) >= min_cnv_length)
        & (df.P_Value >= p_value_threshold)
    )
    return df[keep]

def find_common_cnvs(file1, file2, max_overlap):
    """
    Find the CNVs of ``file1`` that have at least one counterpart in ``file2``.

    A row of ``file2`` is a counterpart of a row of ``file1`` when it is on
    the same chromosome, has the same Type, and both its Start and its Stop
    lie within ``max_overlap`` of the ``file1`` row's Start and Stop.

    Parameters
    ----------
    file1, file2 : pandas.DataFrame
        CNV tables with 'Chrom', 'Start', 'Stop' and 'Type' columns.
    max_overlap : number
        Largest allowed absolute difference between the two start sites and
        between the two stop sites.

    Returns
    -------
    pandas.DataFrame
        For every matching ``file1`` row, that row followed by its
        counterpart(s) from ``file2``, with a fresh 0..n-1 index. When nothing
        matches, an empty dataframe with the columns of ``file1``.

    Notes
    -----
    A summary of the result (``DataFrame.info``) is printed to stdout.
    """
    # Split file2 once; a plain dict lets chromosomes that are missing from
    # file2 be skipped instead of raising KeyError as GroupBy.get_group does.
    file2_by_chrom = dict(iter(file2.groupby('Chrom')))

    matched = []
    for i in range(len(file1)):
        query = file1.iloc[[i]]  # one-row dataframe, ready for concatenation
        row = query.iloc[0]

        candidates = file2_by_chrom.get(row['Chrom'])
        if candidates is None:
            continue

        is_match = (
            candidates['Type'].isin(query['Type'])
            & ((candidates['Start'] - row['Start']).abs() <= max_overlap)
            & ((candidates['Stop'] - row['Stop']).abs() <= max_overlap)
        )
        hits = candidates.loc[is_match]

        if len(hits) > 0:
            matched.extend([query, hits])

    # Concatenate once at the end rather than growing a dataframe inside the
    # loop, which re-copies all accumulated rows on every match.
    if matched:
        common_cnvs = pd.concat(matched).reset_index(drop=True)
    else:
        common_cnvs = pd.DataFrame(columns=file1.columns)

    # info() prints the summary itself and returns None, so it is not wrapped
    # in print().
    common_cnvs.info()
    return common_cnvs

def common_cnv_finder(fpath1,fpath2,file_out="common_cnvs.csv",min_cnv_length=1000, p_value_threshold=.90,
                      max_overlap = 25):
    """
    Run the full pipeline: read two CNV files, filter them, find the CNVs
    common to both and write the result to ``file_out``.

    Parameters
    ----------
    fpath1, fpath2 : path-like
        Tab-delimited CNV files for sample 1 and sample 2.
    file_out : path-like, default "common_cnvs.csv"
        Destination passed to ``DataFrame.to_csv``.
    min_cnv_length : number, default 1000
        Minimum CNV length kept by the initial filter.
    p_value_threshold : number, default 0.90
        Minimum P_Value kept by the initial filter.
    max_overlap : number, default 25
        Largest allowed difference between matching start sites and between
        matching stop sites.

    Returns
    -------
    pandas.DataFrame
        The common CNVs, i.e. the same table that is written to ``file_out``.
    """
    # Read datafiles, tagging each with its sample number
    df1 = read_cnv_file(fpath1, 1)
    df2 = read_cnv_file(fpath2, 2)

    # Do initial filtering
    df1 = filter_by_cnv(df1, min_cnv_length=min_cnv_length, p_value_threshold=p_value_threshold)
    df2 = filter_by_cnv(df2, min_cnv_length=min_cnv_length, p_value_threshold=p_value_threshold)

    common_cnvs = find_common_cnvs(df1, df2, max_overlap)

    # to_csv returns None when given a path, so its result is not kept
    common_cnvs.to_csv(file_out)
    return common_cnvs

1 change: 0 additions & 1 deletion project_spring_2020/sample_file.py

This file was deleted.

22 changes: 22 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import setuptools

# Use the README as the long description of the package
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="common_cnv_finder",
    version="0.0.1",
    author="Ava Papetti",
    author_email="avapapetti@gmail.com",
    description="A tool to detect shared copy number variant (CNV) regions found within two different genomes",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/avapapetti/project_spring_2020",
    packages=setuptools.find_packages(),
    # common_cnv_finder.core_functions imports both of these at runtime
    install_requires=[
        "numpy",
        "pandas",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        # The repository's LICENSE file is the Apache License, Version 2.0
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
Binary file added tests/Data/.DS_Store
Binary file not shown.
49 changes: 49 additions & 0 deletions tests/Data/.ipynb_checkpoints/Test_Sample1.cnv-checkpoint.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
Chrom Start Stop Type P_Value
chr1 53110 92927 DUPLICATION 1
chr1 31203389 31206304 DELETION 0.999999783181523
chr2 137978290 137980038 DUPLICATION 0.999980773091669
chr2 30919195 30919987 DELETION 0.99995494106948
chr3 5494366 5497663 DUPLICATION 0.999999999978001
chr3 198196550 198229677 DELETION 0.999999999999988
chr4 1011404 1011901 DUPLICATION 0.999999534684394
chr4 75790050 75800535 DUPLICATION 0.999999940290934
chr5 18096452 18097200 DUPLICATION 0.999993151035016
chr5 181439550 181463033 DELETION 0.999999999999265
chr6 2200961 2202186 DUPLICATION 0.99998706982702
chr6 170621577 170740361 DELETION 1
chr7 41913860 41916331 DUPLICATION 0.99999909427962
chr7 52676425 52677535 DUPLICATION 0.999996701854162
chr8 1798228 1799873 DELETION 0.999999696931793
chr8 145038307 145057131 DELETION 0.999999999998389
chr9 1173739 1175376 DUPLICATION 0.999980759431522
chr9 138300339 138317841 DELETION 0.999999742953875
chr10 1439103 1440467 DUPLICATION 0.999999319250502
chr10 50173888 50186191 DELETION 1
chr11 4575125 4575469 DUPLICATION 0.999975123832302
chr11 96553992 96555212 DUPLICATION 0.999999999846016
chr12 27240416 27241742 DUPLICATION 0.99996691533253
chr12 61808484 61816684 DUPLICATION 0.999999997937608
chr13 18219045 18220963 DELETION 0.999999999999636
chr13 114180269 114198363 DELETION 0.999993248734387
chr14 18906159 18909123 DUPLICATION 0.999999998466087
chr14 106768429 106768612 DELETION 0.99999983514737
chr15 20494427 20504554 DUPLICATION 0.999999999692232
chr15 101973767 101978709 DELETION 1
chr16 11628 13625 DUPLICATION 0.99999923753466
chr16 89847409 89849498 DELETION 0.99999904477202
chr17 27579976 27590785 DELETION 0.999992685507534
chr17 82103298 82107638 DELETION 0.99998221513287
chr18 10797082 10797734 DELETION 0.999958639917817
chr18 80230004 80248450 DELETION 0.999999999992972
chr19 20820429 20823304 DUPLICATION 0.999999998919136
chr19 58556219 58563749 DELETION 0.999990398061118
chr20 25784188 25795289 DUPLICATION 0.999999369776991
chr20 64275162 64298737 DELETION 0.999999999998537
chr21 5061615 5073442 DUPLICATION 1
chr21 46543530 46543739 DELETION 1
chr22 11068278 11159907 DUPLICATION 1
chr22 50717399 50719104 DELETION 0.999997967384685
chrM 65 320 DUPLICATION 1
chrX 7843830 7846269 DUPLICATION 0.999999921825901
chrX 154341398 154651908 DELETION 1
chrY 11137557 11149178 DELETION 0.999999997160877
49 changes: 49 additions & 0 deletions tests/Data/Test_Sample1.cnv.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
Chrom Start Stop Type P_Value
chr1 53110 92927 DUPLICATION 1
chr1 31203389 31206312 DELETION 0.999999783181523
chr2 137978290 137980038 DUPLICATION 0.999980773091669
chr2 30919195 30919987 DELETION 0.99995494106948
chr3 5494366 5497663 DUPLICATION 0.999999999978001
chr3 198196550 198229677 DELETION 0.999999999999988
chr4 1011404 1011901 DUPLICATION 0.999999534684394
chr4 75790050 75800535 DUPLICATION 0.999999940290934
chr5 18096452 18097200 DUPLICATION 0.999993151035016
chr5 181439550 181463033 DELETION 0.999999999999265
chr6 2200961 2202186 DUPLICATION 0.99998706982702
chr6 170621577 170740361 DELETION 1
chr7 41913860 41916331 DUPLICATION 0.99999909427962
chr7 52676425 52677535 DUPLICATION 0.999996701854162
chr8 1798228 1799873 DELETION 0.999999696931793
chr8 145038307 145057131 DELETION 0.999999999998389
chr9 1173739 1175376 DUPLICATION 0.999980759431522
chr9 138300339 138317841 DELETION 0.999999742953875
chr10 1439103 1440467 DUPLICATION 0.999999319250502
chr10 50173888 50186191 DELETION 1
chr11 4575125 4575469 DUPLICATION 0.999975123832302
chr11 96553992 96555212 DUPLICATION 0.999999999846016
chr12 27240416 27241742 DUPLICATION 0.99996691533253
chr12 61808484 61816684 DUPLICATION 0.999999997937608
chr13 18219045 18220963 DELETION 0.999999999999636
chr13 114180269 114198363 DELETION 0.999993248734387
chr14 18906159 18909123 DUPLICATION 0.999999998466087
chr14 106768429 106768612 DELETION 0.99999983514737
chr15 20494427 20504554 DUPLICATION 0.999999999692232
chr15 101973767 101978709 DELETION 1
chr16 11628 13625 DUPLICATION 0.99999923753466
chr16 89847409 89849498 DELETION 0.99999904477202
chr17 27579976 27590785 DELETION 0.999992685507534
chr17 82103298 82107638 DELETION 0.99998221513287
chr18 10797082 10797734 DELETION 0.999958639917817
chr18 80230004 80248450 DELETION 0.999999999992972
chr19 20820429 20823304 DUPLICATION 0.999999998919136
chr19 58556219 58563749 DELETION 0.999990398061118
chr20 25784188 25795289 DUPLICATION 0.999999369776991
chr20 64275162 64298737 DELETION 0.999999999998537
chr21 5061615 5073442 DUPLICATION 1
chr21 46543530 46543739 DELETION 1
chr22 11068278 11159907 DUPLICATION 1
chr22 50717399 50719104 DELETION 0.999997967384685
chrM 65 320 DUPLICATION 1
chrX 7843830 7846269 DUPLICATION 0.999999921825901
chrX 154341398 154651908 DELETION 1
chrY 11137557 11149178 DELETION 0.999999997160877
48 changes: 48 additions & 0 deletions tests/Data/Test_Sample2.cnv.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
Chrom Start Stop Type P_Value
chr1 13274 19278 DUPLICATION 1
chr1 31203414 31206304 DELETION 0.999998812954422
chr2 1222955 1223210 DUPLICATION 0.999999777157226
chr2 137978290 137980038 DUPLICATION 0.99999338057848
chr3 237690 241733 DUPLICATION 0.999996378906013
chr3 198225001 198230020 DELETION 0.999999998298312
chr4 75790060 75800535 DUPLICATION 0.999999958449423
chr4 110327977 110332609 DUPLICATION 0.99998550528721
chr5 67061516 67061821 DUPLICATION 0.999995292954743
chr5 181420258 181477061 DELETION 1
chr6 114841 128709 DUPLICATION 0.999999989583141
chr6 170439325 170447140 DELETION 1
chr7 41913855 41916331 DUPLICATION 0.999993748149294
chr7 159150246 159152355 DELETION 1
chr8 2247429 2250380 DUPLICATION 0.999999488458097
chr8 1798228 1799872 DELETION 0.999999937038322
chr9 24594183 24596543 DUPLICATION 0.999955076722721
chr9 26025381 26042387 DUPLICATION 0.999989253966844
chr10 7405808 7406456 DUPLICATION 0.999999960198357
chr10 50173888 50186191 DELETION 1
chr11 6100321 6100983 DUPLICATION 0.999997642629142
chr11 96553992 96555212 DUPLICATION 0.999999675379747
chr12 27240416 27241742 DUPLICATION 0.999999933458434
chr12 42626629 42633652 DUPLICATION 1
chr13 21322564 21327478 DELETION 0.999999999729477
chr13 114150560 114151653 DELETION 0.999994409888436
chr14 18280638 18348181 DUPLICATION 1
chr14 106771002 106775160 DELETION 1
chr15 20372992 20384781 DUPLICATION 1
chr15 101977319 101977983 DELETION 0.999999929760797
chr16 14892633 14975447 DUPLICATION 1
chr16 90165030 90199514 DELETION 1
chr17 314358 315007 DUPLICATION 0.999999999999652
chr17 83056134 83068157 DELETION 0.999999999730427
chr18 11908 15099 DUPLICATION 0.999999994567176
chr18 80144309 80147474 DELETION 0.999964957809878
chr19 20820429 20823279 DUPLICATION 0.999999672097788
chr19 42958289 42960950 DUPLICATION 0.999999835381671
chr20 9772372 9772499 DELETION 0.999991813005804
chr20 64290627 64330372 DELETION 1
chr21 8540901 8546312 DUPLICATION 0.999999556728077
chr21 46196722 46203660 DELETION 0.999998629232205
chr22 50696223 50744535 DELETION 1
chr22 50788066 50792363 DELETION 0.999999988697619
chrM 236 358 DUPLICATION 1
chrX 251089 156026949 DUPLICATION 1
chrY 3077771 56886639 DELETION 1
19 changes: 19 additions & 0 deletions tests/Data/expected_cnvs.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Chrom Start Stop Type P_Value Sample#
chr1 31203389 31206312 DELETION 0.9999997831815229 1
chr1 31203414 31206304 DELETION 0.999998812954422 2
chr2 137978290 137980038 DUPLICATION 0.9999807730916691 1
chr2 137978290 137980038 DUPLICATION 0.9999933805784799 2
chr4 75790050 75800535 DUPLICATION 0.9999999402909342 1
chr4 75790060 75800535 DUPLICATION 0.999999958449423 2
chr7 41913860 41916331 DUPLICATION 0.9999990942796201 1
chr7 41913855 41916331 DUPLICATION 0.9999937481492941 2
chr8 1798228 1799873 DELETION 0.9999996969317929 1
chr8 1798228 1799872 DELETION 0.9999999370383221 2
chr10 50173888 50186191 DELETION 1.0 1
chr10 50173888 50186191 DELETION 1.0 2
chr11 96553992 96555212 DUPLICATION 0.9999999998460161 1
chr11 96553992 96555212 DUPLICATION 0.999999675379747 2
chr12 27240416 27241742 DUPLICATION 0.99996691533253 1
chr12 27240416 27241742 DUPLICATION 0.9999999334584341 2
chr19 20820429 20823304 DUPLICATION 0.9999999989191359 1
chr19 20820429 20823279 DUPLICATION 0.9999996720977881 2
9 changes: 0 additions & 9 deletions tests/sample_test.py

This file was deleted.

52 changes: 52 additions & 0 deletions tests/test_core_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pytest
import pandas as pd
import numpy as np
import random
from pathlib import Path

# import core functions as the alias ccf for convenience
import common_cnv_finder.core_functions as ccf

# Fixture tables live in the "Data" directory next to this test module
test_data_dir = Path(__file__).parent / "Data"

# Raw sample 1, tagged with its sample number
file1 = pd.read_csv(test_data_dir / "Test_Sample1.cnv.csv", sep="\t")
file1['Sample#'] = 1

# Raw sample 2, tagged with its sample number
file2 = pd.read_csv(test_data_dir / "Test_Sample2.cnv.csv", sep="\t")
file2['Sample#'] = 2

# Expected common CNVs for the two samples above
file3 = pd.read_csv(test_data_dir / "expected_cnvs.csv", sep="\t")


def test_read_cnv_file():
    """The loaded table has the same columns as the expected-output table."""
    loaded = ccf.read_cnv_file(test_data_dir / "Test_Sample1.cnv.csv", 1)
    expected_columns = file3.columns
    assert expected_columns.equals(loaded.columns)


def test_filter_by_cnv():
    """filter_by_cnv drops chrM rows, short CNVs and low-P_Value CNVs."""
    min_cnv_length = 1000
    p_value_threshold = .90

    # Build the expected table independently; .copy() avoids assigning a new
    # column onto a slice of the shared module-level fixture.
    expected = file1.loc[file1.Chrom != 'chrM'].copy()
    expected['CNV_Length'] = np.abs(expected.Start - expected.Stop)
    expected = expected.loc[(expected.CNV_Length >= min_cnv_length)
                            & (expected.P_Value >= p_value_threshold)]

    result = ccf.filter_by_cnv(file1, min_cnv_length=min_cnv_length,
                               p_value_threshold=p_value_threshold)

    # `in` on a Series tests the index labels, so check the values explicitly
    assert 'chrM' not in result.Chrom.values
    # Assert on the function's output rather than on the expected table
    assert np.abs(result.Start - result.Stop).min() >= min_cnv_length
    assert result.P_Value.min() >= p_value_threshold
    pd.testing.assert_frame_equal(result, expected.drop(columns=['CNV_Length']), check_dtype=False)


def test_common_cnv_finder():
    """The end-to-end pipeline returns the expected CNVs and writes its output file."""
    import tempfile

    expected = file3

    # Write into a throwaway directory so that running the tests does not
    # leave a test_output.csv behind in the working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file_out = Path(tmp_dir) / "test_output.csv"
        result = ccf.common_cnv_finder(test_data_dir / "Test_Sample1.cnv.csv",
                                       test_data_dir / "Test_Sample2.cnv.csv",
                                       file_out=file_out, min_cnv_length=1000,
                                       p_value_threshold=0.90, max_overlap=25)
        assert file_out.exists()

    pd.testing.assert_frame_equal(result, expected, check_dtype=False)


def test_find_common_cnvs():
    """Filtering both samples and matching them reproduces the expected table."""
    filter_options = {"min_cnv_length": 1000, "p_value_threshold": .90}
    filtered1 = ccf.filter_by_cnv(file1, **filter_options)
    filtered2 = ccf.filter_by_cnv(file2, **filter_options)

    result = ccf.find_common_cnvs(filtered1, filtered2, max_overlap=25)

    pd.testing.assert_frame_equal(result, file3, check_dtype=False)
19 changes: 19 additions & 0 deletions tests/test_output.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
,Chrom,Start,Stop,Type,P_Value,Sample#
0,chr1,31203389,31206312,DELETION,0.9999997831815229,1
1,chr1,31203414,31206304,DELETION,0.999998812954422,2
2,chr2,137978290,137980038,DUPLICATION,0.9999807730916691,1
3,chr2,137978290,137980038,DUPLICATION,0.9999933805784799,2
4,chr4,75790050,75800535,DUPLICATION,0.9999999402909342,1
5,chr4,75790060,75800535,DUPLICATION,0.999999958449423,2
6,chr7,41913860,41916331,DUPLICATION,0.9999990942796201,1
7,chr7,41913855,41916331,DUPLICATION,0.9999937481492941,2
8,chr8,1798228,1799873,DELETION,0.9999996969317929,1
9,chr8,1798228,1799872,DELETION,0.9999999370383221,2
10,chr10,50173888,50186191,DELETION,1.0,1
11,chr10,50173888,50186191,DELETION,1.0,2
12,chr11,96553992,96555212,DUPLICATION,0.9999999998460161,1
13,chr11,96553992,96555212,DUPLICATION,0.999999675379747,2
14,chr12,27240416,27241742,DUPLICATION,0.99996691533253,1
15,chr12,27240416,27241742,DUPLICATION,0.9999999334584341,2
16,chr19,20820429,20823304,DUPLICATION,0.9999999989191359,1
17,chr19,20820429,20823279,DUPLICATION,0.9999996720977881,2