Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raw input data locations #21

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
36 changes: 20 additions & 16 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name := "import"
name := "release.generic"
organization := "bio4j"
description := "generic bio4j data import"

Expand All @@ -7,24 +7,28 @@ bucketSuffix := "era7.com"
scalaVersion := "2.11.8"

libraryDependencies ++= Seq (
"bio4j" % "bio4j" % "0.12.0-227-g60cce98",
"bio4j" %% "data-uniprot" % "0.1.1",
"org.scala-lang.modules" %% "scala-xml" % "1.0.5",
"org.scala-lang.modules" %% "scala-java8-compat" % "0.8.0-RC3",
"ohnosequences" %% "fastarious" % "0.6.0"
) ++ testDependencies

lazy val testDependencies = Seq (
"org.scalatest" %% "scalatest" % "2.2.6" % Test
"bio4j" % "bio4j" % "0.12.0-227-g60cce98",
"bio4j" %% "data-uniprot" % "0.1.1",
"org.scala-lang.modules" %% "scala-xml" % "1.0.6",
"org.scala-lang.modules" %% "scala-java8-compat" % "0.8.0",
"ohnosequences" %% "fastarious" % "0.6.0",
"ohnosequences" %% "statika" % "2.0.0-M5",
"org.scalatest" %% "scalatest" % "2.2.6" % Test
)

dependencyOverrides := Set (
"org.scala-lang.modules" %% "scala-xml" % "1.0.5",
"org.scala-lang" % "scala-library" % "2.11.8",
"com.github.pathikrit" %% "better-files" % "2.13.0"
// "org.scala-lang" % "scala-library" % "2.11.8",
"com.github.pathikrit" %% "better-files" % "2.16.0"
)

wartremoverExcluded ++= Seq(
baseDirectory.value/"src"/"main"/"scala"/"uniprot"/"uniprotEntry.scala",
baseDirectory.value/"src"/"test"/"scala"/"ncbiTaxonomy.scala"
)
wartremoverErrors in (Compile, compile) := Seq()
// wartremoverExcluded ++= Seq(
// baseDirectory.value/"src"/"main"/"scala"/"uniprot"/"uniprotEntry.scala",
// baseDirectory.value/"src"/"test"/"scala"/"ncbiTaxonomy.scala"
// )

generateStatikaMetadataIn(Compile)

// This turns on fat-jar publishing during release process:
publishFatArtifact in Release := true
2 changes: 1 addition & 1 deletion project/build.properties
Original file line number Diff line number Diff line change
@@ -1 +1 @@
sbt.version=0.13.12
sbt.version=0.13.13
7 changes: 5 additions & 2 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
resolvers += "Era7 maven releases" at "https://s3-eu-west-1.amazonaws.com/releases.era7.com"
resolvers ++= Seq(
"Era7 maven releases" at "https://s3-eu-west-1.amazonaws.com/releases.era7.com",
"repo.jenkins-ci.org" at "https://repo.jenkins-ci.org/public"
)

addSbtPlugin("ohnosequences" % "nice-sbt-settings" % "0.8.0-RC2")
addSbtPlugin("ohnosequences" % "nice-sbt-settings" % "0.8.0-RC4")
104 changes: 104 additions & 0 deletions src/main/scala/bundles.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package com.bio4j.data

import ohnosequences.statika._
import com.amazonaws.auth._
import ohnosequences.awstools._, s3._
import com.amazonaws.services.s3.transfer._
import java.net.URL
import sys.process._
import better.files._

// Reusable statika bundles for the raw-data pipeline:
//   GetRawData — download upstream files onto the local machine
//   CopyToS3   — mirror the downloaded files to our S3 release bucket
//   GetS3Copy  — fetch the mirrored files from S3 onto another machine
case object bundles {

// All raw data for this release cycle is mirrored under this S3 prefix.
val s3ReleasesPrefix = S3Folder("eu-west-1.raw.bio4j.com", "data/2016_11/")

// Downloads each of `urls` into `baseDirectory`; when `gunzip` is true,
// ".gz" sources are decompressed on the fly and saved without the suffix.
abstract class GetRawData(
val urls: Seq[URL],
val baseDirectory: File,
val gunzip: Boolean
)(deps: AnyBundle*) extends Bundle(deps: _*) {

// Local file that `url` will be written to (created eagerly).
// NOTE(review): `url.getFile` returns the *full* URL path (e.g.
// "/pub/databases/enzyme/release/enzyme.dat"), so this resolves to a path
// nested under `baseDirectory`, not a flat file named after the last
// segment — confirm this is intended, since the per-dataset bundles
// reference flat `baseDirectory / <fileName>` paths.
def destination(url: URL): File = {
val urlFile = url.getFile
val name =
if (gunzip && urlFile.endsWith(".gz")) urlFile.stripSuffix(".gz")
else urlFile

(baseDirectory / name).createIfNotExists()
}

// Destination files, one per entry of `urls` (same order).
lazy val files: Seq[File] = urls.map(destination)

// Opens the remote stream, wrapping it in a gunzip decoder when needed.
def inputStream(url: URL) = {
val stream = url.openStream
if (gunzip && url.getFile.endsWith(".gz")) stream.gzipped
else stream
}

// Streams every URL into its destination file, then reports the directory.
// NOTE(review): this for-comprehension mixes a Seq generator with
// better-files managed resources (`autoClosed` / `outputStream`) — verify
// it compiles as intended and that both streams are closed per URL.
def instructions: AnyInstructions = {
LazyTry {
for {
url <- urls
inS <- inputStream(url).autoClosed
outS <- destination(url).outputStream
} yield inS pipeTo outS
// TODO: some retry logic?
} ->-
say(s"Files are downloaded to ${baseDirectory}")
}

}


// Uploads the given local files into `s3folder`, flat, keyed by file name.
abstract class CopyToS3(
val files: Seq[File],
val s3folder: S3Folder
)(deps: AnyBundle*) extends Bundle(deps: _*) {

// Instance-profile credentials: these bundles are expected to run on EC2.
lazy val s3client = S3.create(new InstanceProfileCredentialsProvider())
lazy val transferManager = new TransferManager(s3client.s3)

def instructions: AnyInstructions = {

LazyTry {
files.foreach { file =>

val target = s3folder / file.name

// Uploads run sequentially; each call blocks until that upload is done.
transferManager.upload(
target.bucket, target.key,
file.toJava
).waitForCompletion
}

transferManager.shutdownNow()
} ->-
say(s"Files are uploaded to ${s3folder.url}")
}

}


// Downloads everything that `s3copy` mirrored into `baseDirectory`.
abstract class GetS3Copy(
val s3copy: CopyToS3,
val baseDirectory: File
)(deps: AnyBundle*) extends Bundle(deps: _*) {

lazy val s3client = S3.create(new InstanceProfileCredentialsProvider())
lazy val transferManager = new TransferManager(s3client.s3)

def instructions: AnyInstructions = {
LazyTry {
// Blocks until the whole S3 folder is downloaded.
transferManager.downloadDirectory(
s3copy.s3folder.bucket, s3copy.s3folder.key,
baseDirectory.toJava
).waitForCompletion

transferManager.shutdownNow()
} ->-
say(s"Files are downloaded to ${baseDirectory}")
}

}

}
39 changes: 39 additions & 0 deletions src/main/scala/enzyme/bundles.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.bio4j.data.enzyme

import com.bio4j.data.bundles._
import java.net.URL
import better.files._

// Bundles for mirroring the ENZYME database flat files from the EBI FTP.
case object bundles {

  // Upstream file names, reused as the local/mirrored names.
  case object fileNames {
    val enzyme   = "enzyme.dat"
    val enzclass = "enzclass.txt"
  }

  // Builds the EBI FTP URL for a given ENZYME release file.
  def releaseUrl(fileName: String): URL =
    new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/enzyme/release/${fileName}")

  // Downloads both release files as-is (they are not gzipped upstream).
  case object rawData extends GetRawData(
    urls          = Seq(releaseUrl(fileNames.enzyme), releaseUrl(fileNames.enzclass)),
    baseDirectory = file"/media/ephemeral0/data/enzyme/",
    gunzip        = false
  )()

  // Mirrors the downloaded files to the release S3 folder.
  case object copyData extends CopyToS3(
    rawData.files,
    s3ReleasesPrefix / "enzyme" /
  )()

  // Fetches the mirrored copies and exposes both files by name.
  case object mirroredData extends GetS3Copy(
    copyData,
    file"/media/ephemeral0/data/enzyme/"
  )() {

    val enzyme   = baseDirectory / fileNames.enzyme
    val enzclass = baseDirectory / fileNames.enzclass
  }

}
37 changes: 37 additions & 0 deletions src/main/scala/go/bundles.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package com.bio4j.data.go

import com.bio4j.data.bundles._
import java.net.URL
import better.files._

// Bundles for mirroring the Gene Ontology termdb OBO-XML dump.
case object bundles {

  // Which archived GO build to pull.
  val release: String = "latest"

  case object fileNames {
    val obo = "go_daily-termdb.obo-xml"
  }

  // Local working directory shared by the download and the mirror fetch.
  val localDir = file"/media/ephemeral0/data/go/"

  // Downloads and gunzips the termdb dump.
  // NOTE: this is daily automatic build, I'm not sure this is the source we want
  case object rawData extends GetRawData(
    urls = Seq(
      new URL("http", "archive.geneontology.org", s"/termdb/${release}/${fileNames.obo}.gz")
    ),
    baseDirectory = localDir,
    gunzip        = true
  )()

  // Mirrors the downloaded file to the release S3 folder.
  case object copyData extends CopyToS3(
    rawData.files,
    s3ReleasesPrefix / "go" /
  )()

  // Fetches the mirrored copy and exposes the obo-xml file by name.
  case object mirroredData extends GetS3Copy(
    copyData,
    localDir
  )() {

    val obo = baseDirectory / fileNames.obo
  }

}
40 changes: 40 additions & 0 deletions src/main/scala/ncbiTaxonomy/bundles.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package com.bio4j.data.ncbiTaxonomy

import com.bio4j.data.bundles._
import java.net.URL
import better.files._

// Bundles for mirroring the NCBI Taxonomy dump (nodes.dmp / names.dmp).
case object bundles {

// File names inside the taxdump archive.
case object fileNames {
val nodes = "nodes.dmp"
val names = "names.dmp"
}

// Downloads and gunzips the taxonomy dump.
// NOTE(review): gunzipping "taxdump.tar.gz" yields a ".tar" archive, and
// nothing in this pipeline un-tars it — so the `taxdump/nodes.dmp` paths
// below will not exist unless extraction happens elsewhere. Confirm.
case object rawData extends GetRawData(
urls = Seq(
new URL("ftp", "ftp.ncbi.nih.gov", "/pub/taxonomy/taxdump.tar.gz")
),
baseDirectory = file"/media/ephemeral0/data/ncbiTaxonomy/",
gunzip = true
)() {

val nodes = baseDirectory / "taxdump" / fileNames.nodes
val names = baseDirectory / "taxdump" / fileNames.names
}

// Mirrors just the two .dmp files (flat, by file name) to S3.
case object copyData extends CopyToS3(
Seq(rawData.nodes, rawData.names),
s3ReleasesPrefix / "ncbiTaxonomy" /
)()

// Fetches the mirrored copies; note these paths are flat, unlike
// `rawData`'s nested `taxdump/` paths above.
case object mirroredData extends GetS3Copy(
copyData,
file"/media/ephemeral0/data/ncbiTaxonomy/"
)() {

val nodes = baseDirectory / fileNames.nodes
val names = baseDirectory / fileNames.names
}

}
35 changes: 35 additions & 0 deletions src/main/scala/uniprot/bundles.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.bio4j.data.uniprot

import com.bio4j.data.bundles._
import java.net.URL
import better.files._

// Bundles for mirroring UniProtKB flat files (SwissProt, TrEMBL, varsplic fasta).
case object bundles {

  // NOTE: only old releases have a date-tag
  val release = "current_release"

  // Upstream file names (they carry a ".gz" suffix on the FTP).
  case object fileNames {
    val sprot    = "uniprot_sprot.dat"            // 517MB gz
    val trembl   = "uniprot_trembl.dat"           // 38.9GB gz
    val varsplic = "uniprot_sprot_varsplic.fasta" // 7.7MB gz
  }

  // TODO: probably it's better to make 3 separate data and import bundles
  case object rawData extends GetRawData(
    urls = Seq(
      fileNames.sprot,
      fileNames.trembl,
      // fix: `varsplic` below referenced a file that was never downloaded
      fileNames.varsplic
    ).map { suffix =>
      // fix: use the `release` val instead of a hard-coded "current_release"
      new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/uniprot/${release}/knowledgebase/complete/${suffix}.gz")
    },
    // fix: was "/media/ephemeral0/data/enzyme/" — a copy-paste leftover that
    // would mix UniProt downloads into the ENZYME data directory
    baseDirectory = file"/media/ephemeral0/data/uniprot/",
    gunzip = true
  )() {

    val sprot    = baseDirectory / fileNames.sprot
    val trembl   = baseDirectory / fileNames.trembl
    val varsplic = baseDirectory / fileNames.varsplic
  }

}
36 changes: 36 additions & 0 deletions src/main/scala/uniref/bundles.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package com.bio4j.data.uniref

import com.bio4j.data.bundles._
import java.net.URL
import better.files._

// Bundles for mirroring the UniRef clustered-set XML dumps.
case object bundles {

  // NOTE: only old releases have a date-tag
  val release = "current_release"

  // Upstream XML file names (gzipped on the FTP).
  case object fileNames {
    val uniref50  = "uniref50.xml"  // 8.5GB gz
    val uniref90  = "uniref90.xml"  // 15.4GB gz
    val uniref100 = "uniref100.xml" // 27.7GB gz
  }

  // TODO: probably it's better to make 3 separate data and import bundles
  case object rawData extends GetRawData(
    urls = Seq(
      fileNames.uniref50,
      fileNames.uniref90,
      fileNames.uniref100
    ).map { name =>
      // fix: the FTP directory segment is the cluster name ("uniref50"),
      // not the file name ("uniref50.xml"); also use the `release` val
      // instead of a hard-coded "current_release"
      new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/uniprot/${release}/uniref/${name.stripSuffix(".xml")}/${name}.gz")
    },
    // fix: was "/media/ephemeral0/data/enzyme/" — copy-paste leftover
    baseDirectory = file"/media/ephemeral0/data/uniref/",
    gunzip = true
  )() {

    val uniref50  = baseDirectory / fileNames.uniref50
    val uniref90  = baseDirectory / fileNames.uniref90
    val uniref100 = baseDirectory / fileNames.uniref100
  }

}