From ac255b71a07e76ea1c2cf51759899c817e2a1dcf Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Tue, 8 Nov 2016 20:13:31 +0100 Subject: [PATCH 01/13] Added generic bundle for downloading raw data from a URL --- build.sbt | 5 +++-- src/main/scala/RawDataBundle.scala | 34 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 src/main/scala/RawDataBundle.scala diff --git a/build.sbt b/build.sbt index 23941fd..a6d2adb 100644 --- a/build.sbt +++ b/build.sbt @@ -11,7 +11,8 @@ libraryDependencies ++= Seq ( "bio4j" %% "data-uniprot" % "0.1.1", "org.scala-lang.modules" %% "scala-xml" % "1.0.5", "org.scala-lang.modules" %% "scala-java8-compat" % "0.8.0-RC3", - "ohnosequences" %% "fastarious" % "0.6.0" + "ohnosequences" %% "fastarious" % "0.6.0", + "ohnosequences" %% "statika" % "2.0.0-M5" ) ++ testDependencies lazy val testDependencies = Seq ( @@ -21,7 +22,7 @@ lazy val testDependencies = Seq ( dependencyOverrides := Set ( "org.scala-lang.modules" %% "scala-xml" % "1.0.5", "org.scala-lang" % "scala-library" % "2.11.8", - "com.github.pathikrit" %% "better-files" % "2.13.0" + "com.github.pathikrit" %% "better-files" % "2.16.0" ) wartremoverExcluded ++= Seq( diff --git a/src/main/scala/RawDataBundle.scala b/src/main/scala/RawDataBundle.scala new file mode 100644 index 0000000..d3eac09 --- /dev/null +++ b/src/main/scala/RawDataBundle.scala @@ -0,0 +1,34 @@ +package com.bio4j.data + +import ohnosequences.statika._ +import java.net.URL +import sys.process._ +import better.files._ + + +abstract class RawDataBundle( + val url: URL, + val baseDirectory: File +) extends AnyBundle { + + lazy val destination: File = (baseDirectory / url.getFile).createIfNotExists() + + def instructions: AnyInstructions = { + + lazy val inputStream = { + val stream = url.openStream + if (url.getFile.endsWith(".gz")) stream.gzipped + else stream + } + + LazyTry { + for { + is <- inputStream.autoClosed + os <- destination.outputStream + } yield is > os + // TODO: some retry logic? + } ->- + say(s"${url} is downloaded and unpacked to ${destination}") + } + +} From b9b1b29d67a4cd93100a580cfbf513a958388ccd Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Sat, 8 Oct 2016 21:00:40 +0200 Subject: [PATCH 02/13] Changed to download from multiple URLs --- src/main/scala/RawDataBundle.scala | 32 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/main/scala/RawDataBundle.scala b/src/main/scala/RawDataBundle.scala index d3eac09..2c14650 100644 --- a/src/main/scala/RawDataBundle.scala +++ b/src/main/scala/RawDataBundle.scala @@ -1,34 +1,36 @@ package com.bio4j.data import ohnosequences.statika._ +import ohnosequences.awstools._, s3._ import java.net.URL import sys.process._ import better.files._ -abstract class RawDataBundle( - val url: URL, - val baseDirectory: File +abstract class GetRawData( + val urls: Set[URL], + val baseDirectory: File, + val gunzip: Boolean ) extends AnyBundle { - lazy val destination: File = (baseDirectory / url.getFile).createIfNotExists() + def destination(url: URL): File = (baseDirectory / url.getFile).createIfNotExists() - def instructions: AnyInstructions = { - - lazy val inputStream = { - val stream = url.openStream - if (url.getFile.endsWith(".gz")) stream.gzipped - else stream - } + def inputStream(url: URL) = { + val stream = url.openStream + if (gunzip && url.getFile.endsWith(".gz")) stream.gzipped + else stream + } + def instructions: AnyInstructions = { LazyTry { for { - is <- inputStream.autoClosed - os <- destination.outputStream - } yield is > os + url <- urls + inS <- inputStream(url).autoClosed + outS <- destination(url).outputStream + } yield inS pipeTo outS // TODO: some retry logic? } ->- - say(s"${url} is downloaded and unpacked to ${destination}") + say(s"Files are downloaded to ${baseDirectory}") } } From e90cd113c3699defbd794acad60e8bf067814def Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Sun, 9 Oct 2016 14:20:13 +0200 Subject: [PATCH 03/13] Added abstract bundle for copying files to S3 --- src/main/scala/RawDataBundle.scala | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/main/scala/RawDataBundle.scala b/src/main/scala/RawDataBundle.scala index 2c14650..9121a86 100644 --- a/src/main/scala/RawDataBundle.scala +++ b/src/main/scala/RawDataBundle.scala @@ -1,7 +1,9 @@ package com.bio4j.data import ohnosequences.statika._ +import com.amazonaws.auth._ import ohnosequences.awstools._, s3._ +import com.amazonaws.services.s3.transfer._ import java.net.URL import sys.process._ import better.files._ @@ -34,3 +36,39 @@ abstract class GetRawData( } } + + +abstract class CopyToS3( + val file: File, + val s3folder: S3Folder +) extends AnyBundle { + + lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) + + lazy val transferManager = new TransferManager(s3client.s3) + + def instructions: AnyInstructions = { + + LazyTry { + if (file.isDirectory) { + val target = s3folder / file.name + + transferManager.upload( + target.bucket, target.key, + file.toJava + ).waitForCompletion + } else { + + transferManager.uploadDirectory( + s3folder.bucket, s3folder.key, + file.toJava, + true // recursively + ).waitForCompletion + } + } -&- LazyTry { + transferManager.shutdownNow() + } ->- + say(s"Files are uploaded to ${s3folder.url}") + } + +} From fccf675374bd7ddd5a23f5dfa2d2ae2bbe07814b Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Sun, 9 Oct 2016 16:12:34 +0200 Subject: [PATCH 04/13] Added an abstract bundle to download mirrored data --- src/main/scala/RawDataBundle.scala | 38 +++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/main/scala/RawDataBundle.scala b/src/main/scala/RawDataBundle.scala index 9121a86..cc959ee 100644 --- a/src/main/scala/RawDataBundle.scala +++ b/src/main/scala/RawDataBundle.scala @@ -39,36 +39,52 @@ abstract class GetRawData( abstract class CopyToS3( - val file: File, + val files: Seq[File], val s3folder: S3Folder ) extends AnyBundle { lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) - lazy val transferManager = new TransferManager(s3client.s3) def instructions: AnyInstructions = { LazyTry { - if (file.isDirectory) { + files.foreach { file => + val target = s3folder / file.name transferManager.upload( target.bucket, target.key, file.toJava ).waitForCompletion - } else { - - transferManager.uploadDirectory( - s3folder.bucket, s3folder.key, - file.toJava, - true // recursively - ).waitForCompletion } - } -&- LazyTry { + transferManager.shutdownNow() } ->- say(s"Files are uploaded to ${s3folder.url}") } } + + +abstract class GetS3Copy( + val s3copy: CopyToS3, + val baseDirectory: File +) extends AnyBundle { + + lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) + lazy val transferManager = new TransferManager(s3client.s3) + + def instructions: AnyInstructions = { + LazyTry { + transferManager.downloadDirectory( + s3copy.s3folder.bucket, s3copy.s3folder.key, + baseDirectory.toJava + ).waitForCompletion + + transferManager.shutdownNow() + } ->- + say(s"Files are downloaded to ${baseDirectory}") + } + +} From f018b24532c964816f9bc7a03e388441829458e0 Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Mon, 21 Nov 2016 17:23:10 +0100 Subject: [PATCH 05/13] Added bundles for enzyme data --- src/main/scala/RawDataBundle.scala | 9 +++++---- src/main/scala/enzyme/bundles.scala | 30 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 src/main/scala/enzyme/bundles.scala diff --git a/src/main/scala/RawDataBundle.scala b/src/main/scala/RawDataBundle.scala index cc959ee..cc8e307 100644 --- a/src/main/scala/RawDataBundle.scala +++ b/src/main/scala/RawDataBundle.scala @@ -10,12 +10,13 @@ import better.files._ abstract class GetRawData( - val urls: Set[URL], + val urls: Seq[URL], val baseDirectory: File, val gunzip: Boolean -) extends AnyBundle { +)(deps: AnyBundle*) extends Bundle(deps: _*) { def destination(url: URL): File = (baseDirectory / url.getFile).createIfNotExists() + def files: Seq[File] = urls.map(destination) def inputStream(url: URL) = { val stream = url.openStream @@ -41,7 +42,7 @@ abstract class GetRawData( abstract class CopyToS3( val files: Seq[File], val s3folder: S3Folder -) extends AnyBundle { +)(deps: AnyBundle*) extends Bundle(deps: _*) { lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) lazy val transferManager = new TransferManager(s3client.s3) @@ -70,7 +71,7 @@ abstract class CopyToS3( abstract class GetS3Copy( val s3copy: CopyToS3, val baseDirectory: File -) extends AnyBundle { +)(deps: AnyBundle*) extends Bundle(deps: _*) { lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) lazy val transferManager = new TransferManager(s3client.s3) diff --git a/src/main/scala/enzyme/bundles.scala b/src/main/scala/enzyme/bundles.scala new file mode 100644 index 0000000..7a88248 --- /dev/null +++ b/src/main/scala/enzyme/bundles.scala @@ -0,0 +1,30 @@ +package com.bio4j.data.enzyme + +import com.bio4j.data._ +import java.net.URL +import better.files._ + +case object bundles { + + case object rawData extends GetRawData( + urls = Seq( + "enzyme.dat", + "enzclass.txt" + ).map { suffix => + new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/enzyme/release/${suffix}") + }, + baseDirectory = file"/media/ephemeral0/enzyme/raw/", + gunzip = false + )() + + case object copyData extends CopyToS3( + rawData.files, + ??? + )() + + case object mirroredData extends GetS3Copy( + copyData, + file"/media/ephemeral0/enzyme/data/" + )() + +} From b0c8dad76729265f07c49ba91ed14f9f9e3b245d Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Tue, 22 Nov 2016 14:34:33 +0100 Subject: [PATCH 06/13] Adde GO and NCBITaxonomy data bundles --- build.sbt | 9 ++- src/main/scala/RawDataBundle.scala | 91 --------------------- src/main/scala/bundles.scala | 96 +++++++++++++++++++++++ src/main/scala/enzyme/bundles.scala | 6 +- src/main/scala/go/bundles.scala | 20 +++++ src/main/scala/ncbiTaxonomy/bundles.scala | 27 +++++++ 6 files changed, 151 insertions(+), 98 deletions(-) delete mode 100644 src/main/scala/RawDataBundle.scala create mode 100644 src/main/scala/bundles.scala create mode 100644 src/main/scala/go/bundles.scala create mode 100644 src/main/scala/ncbiTaxonomy/bundles.scala diff --git a/build.sbt b/build.sbt index a6d2adb..cb69f4e 100644 --- a/build.sbt +++ b/build.sbt @@ -25,7 +25,8 @@ dependencyOverrides := Set ( "com.github.pathikrit" %% "better-files" % "2.16.0" ) -wartremoverExcluded ++= Seq( - baseDirectory.value/"src"/"main"/"scala"/"uniprot"/"uniprotEntry.scala", - baseDirectory.value/"src"/"test"/"scala"/"ncbiTaxonomy.scala" -) +wartremoverErrors in (Compile, compile) := Seq() +// wartremoverExcluded ++= Seq( +// baseDirectory.value/"src"/"main"/"scala"/"uniprot"/"uniprotEntry.scala", +// baseDirectory.value/"src"/"test"/"scala"/"ncbiTaxonomy.scala" +// ) diff --git a/src/main/scala/RawDataBundle.scala b/src/main/scala/RawDataBundle.scala deleted file mode 100644 index cc8e307..0000000 --- a/src/main/scala/RawDataBundle.scala +++ /dev/null @@ -1,91 +0,0 @@ -package com.bio4j.data - -import ohnosequences.statika._ -import com.amazonaws.auth._ -import ohnosequences.awstools._, s3._ -import com.amazonaws.services.s3.transfer._ -import java.net.URL -import sys.process._ -import better.files._ - - -abstract class GetRawData( - val urls: Seq[URL], - val baseDirectory: File, - val gunzip: Boolean -)(deps: AnyBundle*) extends Bundle(deps: _*) { - - def destination(url: URL): File = (baseDirectory / url.getFile).createIfNotExists() - def files: Seq[File] = urls.map(destination) - - def inputStream(url: URL) = { - val stream = url.openStream - if (gunzip && url.getFile.endsWith(".gz")) stream.gzipped - else stream - } - - def instructions: AnyInstructions = { - LazyTry { - for { - url <- urls - inS <- inputStream(url).autoClosed - outS <- destination(url).outputStream - } yield inS pipeTo outS - // TODO: some retry logic? - } ->- - say(s"Files are downloaded to ${baseDirectory}") - } - -} - - -abstract class CopyToS3( - val files: Seq[File], - val s3folder: S3Folder -)(deps: AnyBundle*) extends Bundle(deps: _*) { - - lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) - lazy val transferManager = new TransferManager(s3client.s3) - - def instructions: AnyInstructions = { - - LazyTry { - files.foreach { file => - - val target = s3folder / file.name - - transferManager.upload( - target.bucket, target.key, - file.toJava - ).waitForCompletion - } - - transferManager.shutdownNow() - } ->- - say(s"Files are uploaded to ${s3folder.url}") - } - -} - - -abstract class GetS3Copy( - val s3copy: CopyToS3, - val baseDirectory: File -)(deps: AnyBundle*) extends Bundle(deps: _*) { - - lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) - lazy val transferManager = new TransferManager(s3client.s3) - - def instructions: AnyInstructions = { - LazyTry { - transferManager.downloadDirectory( - s3copy.s3folder.bucket, s3copy.s3folder.key, - baseDirectory.toJava - ).waitForCompletion - - transferManager.shutdownNow() - } ->- - say(s"Files are downloaded to ${baseDirectory}") - } - -} diff --git a/src/main/scala/bundles.scala b/src/main/scala/bundles.scala new file mode 100644 index 0000000..2f3a914 --- /dev/null +++ b/src/main/scala/bundles.scala @@ -0,0 +1,96 @@ +package com.bio4j.data + +import ohnosequences.statika._ +import com.amazonaws.auth._ +import ohnosequences.awstools._, s3._ +import com.amazonaws.services.s3.transfer._ +import java.net.URL +import sys.process._ +import better.files._ + +case object bundles { + + val releasesPrefix = S3Folder("releases.bio4j.com", "2016_12_01") + + abstract class GetRawData( + val urls: Seq[URL], + val baseDirectory: File, + val gunzip: Boolean + )(deps: AnyBundle*) extends Bundle(deps: _*) { + + def destination(url: URL): File = (baseDirectory / url.getFile).createIfNotExists() + def files: Seq[File] = urls.map(destination) + + def inputStream(url: URL) = { + val stream = url.openStream + if (gunzip && url.getFile.endsWith(".gz")) stream.gzipped + else stream + } + + def instructions: AnyInstructions = { + LazyTry { + for { + url <- urls + inS <- inputStream(url).autoClosed + outS <- destination(url).outputStream + } yield inS pipeTo outS + // TODO: some retry logic? + } ->- + say(s"Files are downloaded to ${baseDirectory}") + } + + } + + + abstract class CopyToS3( + val files: Seq[File], + val s3folder: S3Folder + )(deps: AnyBundle*) extends Bundle(deps: _*) { + + lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) + lazy val transferManager = new TransferManager(s3client.s3) + + def instructions: AnyInstructions = { + + LazyTry { + files.foreach { file => + + val target = s3folder / file.name + + transferManager.upload( + target.bucket, target.key, + file.toJava + ).waitForCompletion + } + + transferManager.shutdownNow() + } ->- + say(s"Files are uploaded to ${s3folder.url}") + } + + } + + + abstract class GetS3Copy( + val s3copy: CopyToS3, + val baseDirectory: File + )(deps: AnyBundle*) extends Bundle(deps: _*) { + + lazy val s3client = S3.create(new InstanceProfileCredentialsProvider()) + lazy val transferManager = new TransferManager(s3client.s3) + + def instructions: AnyInstructions = { + LazyTry { + transferManager.downloadDirectory( + s3copy.s3folder.bucket, s3copy.s3folder.key, + baseDirectory.toJava + ).waitForCompletion + + transferManager.shutdownNow() + } ->- + say(s"Files are downloaded to ${baseDirectory}") + } + + } + +} diff --git a/src/main/scala/enzyme/bundles.scala b/src/main/scala/enzyme/bundles.scala index 7a88248..7479b07 100644 --- a/src/main/scala/enzyme/bundles.scala +++ b/src/main/scala/enzyme/bundles.scala @@ -1,6 +1,6 @@ package com.bio4j.data.enzyme -import com.bio4j.data._ +import com.bio4j.data.bundles._ import java.net.URL import better.files._ @@ -13,13 +13,13 @@ case object bundles { ).map { suffix => new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/enzyme/release/${suffix}") }, - baseDirectory = file"/media/ephemeral0/enzyme/raw/", + baseDirectory = file"/media/ephemeral0/enzyme/data/", gunzip = false )() case object copyData extends CopyToS3( rawData.files, - ??? + releasesPrefix / "data" / "enzyme" / )() case object mirroredData extends GetS3Copy( diff --git a/src/main/scala/go/bundles.scala b/src/main/scala/go/bundles.scala new file mode 100644 index 0000000..39db349 --- /dev/null +++ b/src/main/scala/go/bundles.scala @@ -0,0 +1,20 @@ +package com.bio4j.data.go + +import com.bio4j.data.bundles._ +import java.net.URL +import better.files._ + +case object bundles { + + val release: String = "latest" + + case object rawData extends GetRawData( + urls = Seq( + // NOTE: this is daily automatic build, I'm not sure this is the source we want + new URL("http", "archive.geneontology.org", s"/termdb/${release}/go_daily-termdb.obo-xml.gz") + ), + baseDirectory = file"/media/ephemeral0/go/data/", + gunzip = true + )() + +} diff --git a/src/main/scala/ncbiTaxonomy/bundles.scala b/src/main/scala/ncbiTaxonomy/bundles.scala new file mode 100644 index 0000000..480a552 --- /dev/null +++ b/src/main/scala/ncbiTaxonomy/bundles.scala @@ -0,0 +1,27 @@ +package com.bio4j.data.ncbiTaxonomy + +import com.bio4j.data.bundles._ +import java.net.URL +import better.files._ + +case object bundles { + + case object rawData extends GetRawData( + urls = Seq( + new URL("ftp", "ftp.ncbi.nih.gov", "/pub/taxonomy/taxdump.tar.gz") + ), + baseDirectory = file"/media/ephemeral0/ncbiTaxonomy/raw/", + gunzip = true + )() + + case object copyData extends CopyToS3( + rawData.files, + releasesPrefix / "data" / "ncbiTaxonomy" / + )() + + case object mirroredData extends GetS3Copy( + copyData, + file"/media/ephemeral0/ncbiTaxonomy/data/" + )() + +} From e3493cb2fad919bd12228539ecacdea6bf47df6e Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Tue, 22 Nov 2016 17:07:01 +0100 Subject: [PATCH 07/13] Added references to the files involved --- src/main/scala/bundles.scala | 14 ++++++++++--- src/main/scala/enzyme/bundles.scala | 21 +++++++++++++------ src/main/scala/go/bundles.scala | 21 +++++++++++++++++-- src/main/scala/ncbiTaxonomy/bundles.scala | 25 +++++++++++++++++------ 4 files changed, 64 insertions(+), 17 deletions(-) diff --git a/src/main/scala/bundles.scala b/src/main/scala/bundles.scala index 2f3a914..bf93a3f 100644 --- a/src/main/scala/bundles.scala +++ b/src/main/scala/bundles.scala @@ -10,7 +10,7 @@ import better.files._ case object bundles { - val releasesPrefix = S3Folder("releases.bio4j.com", "2016_12_01") + val s3ReleasesPrefix = S3Folder("releases.bio4j.com", "2016_12_01") abstract class GetRawData( val urls: Seq[URL], @@ -18,8 +18,16 @@ case object bundles { val gunzip: Boolean )(deps: AnyBundle*) extends Bundle(deps: _*) { - def destination(url: URL): File = (baseDirectory / url.getFile).createIfNotExists() - def files: Seq[File] = urls.map(destination) + def destination(url: URL): File = { + val urlFile = url.getFile + val name = + if (gunzip && urlFile.endsWith(".gz")) urlFile.stripSuffix(".gz") + else urlFile + + (baseDirectory / name).createIfNotExists() + } + + lazy val files: Seq[File] = urls.map(destination) def inputStream(url: URL) = { val stream = url.openStream diff --git a/src/main/scala/enzyme/bundles.scala b/src/main/scala/enzyme/bundles.scala index 7479b07..097dd06 100644 --- a/src/main/scala/enzyme/bundles.scala +++ b/src/main/scala/enzyme/bundles.scala @@ -6,25 +6,34 @@ import better.files._ case object bundles { + case object fileNames { + val enzyme = "enzyme.dat" + val enzclass = "enzclass.txt" + } + case object rawData extends GetRawData( urls = Seq( - "enzyme.dat", - "enzclass.txt" + fileNames.enzyme, + fileNames.enzclass ).map { suffix => new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/enzyme/release/${suffix}") }, - baseDirectory = file"/media/ephemeral0/enzyme/data/", + baseDirectory = file"/media/ephemeral0/data/enzyme/", gunzip = false )() case object copyData extends CopyToS3( rawData.files, - releasesPrefix / "data" / "enzyme" / + s3ReleasesPrefix / "data" / "enzyme" / )() case object mirroredData extends GetS3Copy( copyData, - file"/media/ephemeral0/enzyme/data/" - )() + file"/media/ephemeral0/data/enzyme/" + )() { + + val enzyme = baseDirectory / fileNames.enzyme + val enzclass = baseDirectory / fileNames.enzclass + } } diff --git a/src/main/scala/go/bundles.scala b/src/main/scala/go/bundles.scala index 39db349..6df7019 100644 --- a/src/main/scala/go/bundles.scala +++ b/src/main/scala/go/bundles.scala @@ -8,13 +8,30 @@ case object bundles { val release: String = "latest" + case object fileNames { + val obo = "go_daily-termdb.obo-xml" + } + case object rawData extends GetRawData( urls = Seq( // NOTE: this is daily automatic build, I'm not sure this is the source we want - new URL("http", "archive.geneontology.org", s"/termdb/${release}/go_daily-termdb.obo-xml.gz") + new URL("http", "archive.geneontology.org", s"/termdb/${release}/${fileNames.obo}.gz") ), - baseDirectory = file"/media/ephemeral0/go/data/", + baseDirectory = file"/media/ephemeral0/data/go/", gunzip = true )() + case object copyData extends CopyToS3( + rawData.files, + s3ReleasesPrefix / "data" / "go" / + )() + + case object mirroredData extends GetS3Copy( + copyData, + file"/media/ephemeral0/data/go/" + )() { + + val obo = baseDirectory / fileNames.obo + } + } diff --git a/src/main/scala/ncbiTaxonomy/bundles.scala b/src/main/scala/ncbiTaxonomy/bundles.scala index 480a552..60f2a6c 100644 --- a/src/main/scala/ncbiTaxonomy/bundles.scala +++ b/src/main/scala/ncbiTaxonomy/bundles.scala @@ -6,22 +6,35 @@ import better.files._ case object bundles { + case object fileNames { + val nodes = "nodes.dmp" + val names = "names.dmp" + } + case object rawData extends GetRawData( urls = Seq( new URL("ftp", "ftp.ncbi.nih.gov", "/pub/taxonomy/taxdump.tar.gz") ), - baseDirectory = file"/media/ephemeral0/ncbiTaxonomy/raw/", + baseDirectory = file"/media/ephemeral0/data/ncbiTaxonomy/", gunzip = true - )() + )() { + + val nodes = baseDirectory / "taxdump" / fileNames.nodes + val names = baseDirectory / "taxdump" / fileNames.names + } case object copyData extends CopyToS3( - rawData.files, - releasesPrefix / "data" / "ncbiTaxonomy" / + Seq(rawData.nodes, rawData.names), + s3ReleasesPrefix / "data" / "ncbiTaxonomy" / )() case object mirroredData extends GetS3Copy( copyData, - file"/media/ephemeral0/ncbiTaxonomy/data/" - )() + file"/media/ephemeral0/data/ncbiTaxonomy/" + )() { + + val nodes = baseDirectory / fileNames.nodes + val names = baseDirectory / fileNames.names + } } From a8d5c870c3b3f5789331a8a731b2d103bb9fb794 Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Tue, 22 Nov 2016 17:27:56 +0100 Subject: [PATCH 08/13] Added UniRef --- src/main/scala/uniref/bundles.scala | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 src/main/scala/uniref/bundles.scala diff --git a/src/main/scala/uniref/bundles.scala b/src/main/scala/uniref/bundles.scala new file mode 100644 index 0000000..8dd7c1d --- /dev/null +++ b/src/main/scala/uniref/bundles.scala @@ -0,0 +1,31 @@ +package com.bio4j.data.uniref + +import com.bio4j.data.bundles._ +import java.net.URL +import better.files._ + +case object bundles { + + // NOTE: only old releases have a date-tag + val release = "current_release" + + case object fileNames { + val uniref50 = "uniref50.xml" // 8.5GB gz + val uniref90 = "uniref90.xml" // 15.4GB gz + val uniref100 = "uniref100.xml" // 27.7GB gz + } + + // TODO: probably it's better to make 3 separate data and import bundles + case object rawData extends GetRawData( + urls = Seq( + fileNames.uniref50, + fileNames.uniref90, + fileNames.uniref100 + ).map { suffix => + new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/uniprot/current_release/uniref/${suffix}/${suffix}.xml.gz") + }, + baseDirectory = file"/media/ephemeral0/data/enzyme/", + gunzip = true + )() + +} From d4ef81a06e46634caa9027a32942e58114af0657 Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Tue, 22 Nov 2016 17:32:59 +0100 Subject: [PATCH 09/13] Added UniProt --- src/main/scala/uniprot/bundles.scala | 29 ++++++++++++++++++++++++++++ src/main/scala/uniref/bundles.scala | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 src/main/scala/uniprot/bundles.scala diff --git a/src/main/scala/uniprot/bundles.scala b/src/main/scala/uniprot/bundles.scala new file mode 100644 index 0000000..5f8cbe9 --- /dev/null +++ b/src/main/scala/uniprot/bundles.scala @@ -0,0 +1,29 @@ +package com.bio4j.data.uniprot + +import com.bio4j.data.bundles._ +import java.net.URL +import better.files._ + +case object bundles { + + // NOTE: only old releases have a date-tag + val release = "current_release" + + case object fileNames { + val sprot = "uniprot_sprot.dat" // 517MB gz + val trembl = "uniprot_trembl.dat" // 38.9GB gz + } + + // TODO: probably it's better to make 3 separate data and import bundles + case object rawData extends GetRawData( + urls = Seq( + fileNames.sprot, + fileNames.trembl + ).map { suffix => + new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/uniprot/current_release/knowledgebase/complete/${suffix}.gz") + }, + baseDirectory = file"/media/ephemeral0/data/enzyme/", + gunzip = true + )() + +} diff --git a/src/main/scala/uniref/bundles.scala b/src/main/scala/uniref/bundles.scala index 8dd7c1d..1ad4d3c 100644 --- a/src/main/scala/uniref/bundles.scala +++ b/src/main/scala/uniref/bundles.scala @@ -22,7 +22,7 @@ case object bundles { fileNames.uniref90, fileNames.uniref100 ).map { suffix => - new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/uniprot/current_release/uniref/${suffix}/${suffix}.xml.gz") + new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/uniprot/current_release/uniref/${suffix}/${suffix}.gz") }, baseDirectory = file"/media/ephemeral0/data/enzyme/", gunzip = true From 78dfde0249bcf16fd188e33e48e523ec6e1faaa3 Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Wed, 23 Nov 2016 13:01:04 +0100 Subject: [PATCH 10/13] Fixed S3 paths --- src/main/scala/bundles.scala | 2 +- src/main/scala/enzyme/bundles.scala | 2 +- src/main/scala/go/bundles.scala | 2 +- src/main/scala/ncbiTaxonomy/bundles.scala | 2 +- src/main/scala/uniprot/bundles.scala | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/scala/bundles.scala b/src/main/scala/bundles.scala index bf93a3f..605b77e 100644 --- a/src/main/scala/bundles.scala +++ b/src/main/scala/bundles.scala @@ -10,7 +10,7 @@ import better.files._ case object bundles { - val s3ReleasesPrefix = S3Folder("releases.bio4j.com", "2016_12_01") + val s3ReleasesPrefix = S3Folder("eu-west-1.raw.bio4j.com", "data/2016_11/") abstract class GetRawData( val urls: Seq[URL], diff --git a/src/main/scala/enzyme/bundles.scala b/src/main/scala/enzyme/bundles.scala index 097dd06..13e3cd9 100644 --- a/src/main/scala/enzyme/bundles.scala +++ b/src/main/scala/enzyme/bundles.scala @@ -24,7 +24,7 @@ case object bundles { case object copyData extends CopyToS3( rawData.files, - s3ReleasesPrefix / "data" / "enzyme" / + s3ReleasesPrefix / "enzyme" / )() case object mirroredData extends GetS3Copy( diff --git a/src/main/scala/go/bundles.scala b/src/main/scala/go/bundles.scala index 6df7019..219ec56 100644 --- a/src/main/scala/go/bundles.scala +++ b/src/main/scala/go/bundles.scala @@ -23,7 +23,7 @@ case object bundles { case object copyData extends CopyToS3( rawData.files, - s3ReleasesPrefix / "data" / "go" / + s3ReleasesPrefix / "go" / )() case object mirroredData extends GetS3Copy( diff --git a/src/main/scala/ncbiTaxonomy/bundles.scala b/src/main/scala/ncbiTaxonomy/bundles.scala index 60f2a6c..f44c5fe 100644 --- a/src/main/scala/ncbiTaxonomy/bundles.scala +++ b/src/main/scala/ncbiTaxonomy/bundles.scala @@ -25,7 +25,7 @@ case object bundles { case object copyData extends CopyToS3( Seq(rawData.nodes, rawData.names), - s3ReleasesPrefix / "data" / "ncbiTaxonomy" / + s3ReleasesPrefix / "ncbiTaxonomy" / )() case object mirroredData extends GetS3Copy( diff --git a/src/main/scala/uniprot/bundles.scala b/src/main/scala/uniprot/bundles.scala index 5f8cbe9..2c6edc8 100644 --- a/src/main/scala/uniprot/bundles.scala +++ b/src/main/scala/uniprot/bundles.scala @@ -10,8 +10,8 @@ case object bundles { val release = "current_release" case object fileNames { - val sprot = "uniprot_sprot.dat" // 517MB gz - val trembl = "uniprot_trembl.dat" // 38.9GB gz + val sprot = "uniprot_sprot.dat" // 517MB gz + val trembl = "uniprot_trembl.dat" // 38.9GB gz } // TODO: probably it's better to make 3 separate data and import bundles From 86cc42d923e4ad3847885bd80166f1fcfbafcfaa Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Wed, 23 Nov 2016 13:08:31 +0100 Subject: [PATCH 11/13] Updated minor versions of dependencies --- build.sbt | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/build.sbt b/build.sbt index cb69f4e..f2395d6 100644 --- a/build.sbt +++ b/build.sbt @@ -7,21 +7,18 @@ bucketSuffix := "era7.com" scalaVersion := "2.11.8" libraryDependencies ++= Seq ( - "bio4j" % "bio4j" % "0.12.0-227-g60cce98", - "bio4j" %% "data-uniprot" % "0.1.1", - "org.scala-lang.modules" %% "scala-xml" % "1.0.5", - "org.scala-lang.modules" %% "scala-java8-compat" % "0.8.0-RC3", - "ohnosequences" %% "fastarious" % "0.6.0", - "ohnosequences" %% "statika" % "2.0.0-M5" -) ++ testDependencies - -lazy val testDependencies = Seq ( - "org.scalatest" %% "scalatest" % "2.2.6" % Test + "bio4j" % "bio4j" % "0.12.0-227-g60cce98", + "bio4j" %% "data-uniprot" % "0.1.1", + "org.scala-lang.modules" %% "scala-xml" % "1.0.6", + "org.scala-lang.modules" %% "scala-java8-compat" % "0.8.0", + "ohnosequences" %% "fastarious" % "0.6.0", + "ohnosequences" %% "statika" % "2.0.0-M5", + "org.scalatest" %% "scalatest" % "2.2.6" % Test ) dependencyOverrides := Set ( "org.scala-lang.modules" %% "scala-xml" % "1.0.5", - "org.scala-lang" % "scala-library" % "2.11.8", + // "org.scala-lang" % "scala-library" % "2.11.8", "com.github.pathikrit" %% "better-files" % "2.16.0" ) @@ -30,3 +27,8 @@ wartremoverErrors in (Compile, compile) := Seq() // baseDirectory.value/"src"/"main"/"scala"/"uniprot"/"uniprotEntry.scala", // baseDirectory.value/"src"/"test"/"scala"/"ncbiTaxonomy.scala" // ) + +generateStatikaMetadataIn(Compile) + +// This turns on fat-jar publishing during release process: +publishFatArtifact in Release := true From b19567e98bf0f9c09cdb0d50c52ce53df28dbaf4 Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Wed, 23 Nov 2016 13:12:48 +0100 Subject: [PATCH 12/13] Updated plugin; fixed project name --- build.sbt | 2 +- project/build.properties | 2 +- project/plugins.sbt | 7 +++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/build.sbt b/build.sbt index f2395d6..167647e 100644 --- a/build.sbt +++ b/build.sbt @@ -1,4 +1,4 @@ -name := "import" +name := "release.generic" organization := "bio4j" description := "generic bio4j data import" diff --git a/project/build.properties b/project/build.properties index 35c88ba..27e88aa 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=0.13.12 +sbt.version=0.13.13 diff --git a/project/plugins.sbt b/project/plugins.sbt index c04e7c2..24cdb7f 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,3 +1,6 @@ -resolvers += "Era7 maven releases" at "https://s3-eu-west-1.amazonaws.com/releases.era7.com" +resolvers ++= Seq( + "Era7 maven releases" at "https://s3-eu-west-1.amazonaws.com/releases.era7.com", + "repo.jenkins-ci.org" at "https://repo.jenkins-ci.org/public" +) -addSbtPlugin("ohnosequences" % "nice-sbt-settings" % "0.8.0-RC2") +addSbtPlugin("ohnosequences" % "nice-sbt-settings" % "0.8.0-RC4") From de7cf470f72aae5a50fc8e17df185cfb42e8fb12 Mon Sep 17 00:00:00 2001 From: Alexey Alekhin Date: Wed, 23 Nov 2016 13:54:19 +0100 Subject: [PATCH 13/13] Added files references for uniprot/uniref --- src/main/scala/uniprot/bundles.scala | 12 +++++++++--- src/main/scala/uniref/bundles.scala | 7 ++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/main/scala/uniprot/bundles.scala b/src/main/scala/uniprot/bundles.scala index 2c6edc8..3af0170 100644 --- a/src/main/scala/uniprot/bundles.scala +++ b/src/main/scala/uniprot/bundles.scala @@ -10,8 +10,9 @@ case object bundles { val release = "current_release" case object fileNames { - val sprot = "uniprot_sprot.dat" // 517MB gz - val trembl = "uniprot_trembl.dat" // 38.9GB gz + val sprot = "uniprot_sprot.dat" // 517MB gz + val trembl = "uniprot_trembl.dat" // 38.9GB gz + val varsplic = "uniprot_sprot_varsplic.fasta" // 7.7MB gz } // TODO: probably it's better to make 3 separate data and import bundles @@ -24,6 +25,11 @@ case object bundles { }, baseDirectory = file"/media/ephemeral0/data/enzyme/", gunzip = true - )() + )() { + + val sprot = baseDirectory / fileNames.sprot + val trembl = baseDirectory / fileNames.trembl + val varsplic = baseDirectory / fileNames.varsplic + } } diff --git a/src/main/scala/uniref/bundles.scala b/src/main/scala/uniref/bundles.scala index 1ad4d3c..ed784de 100644 --- a/src/main/scala/uniref/bundles.scala +++ b/src/main/scala/uniref/bundles.scala @@ -26,6 +26,11 @@ case object bundles { }, baseDirectory = file"/media/ephemeral0/data/enzyme/", gunzip = true - )() + )() { + + val uniref50 = baseDirectory / fileNames.uniref50 + val uniref90 = baseDirectory / fileNames.uniref90 + val uniref100 = baseDirectory / fileNames.uniref100 + } }