From 46e76b148ac09e1f3e6da7ee6e6e3db045f68840 Mon Sep 17 00:00:00 2001 From: Knut Finstermeier Date: Tue, 12 May 2026 09:10:26 +0200 Subject: [PATCH] feat: removed NCBI access from lesson 8 --- lessons/lesson_08.ipynb | 96 ----------------------------------------- 1 file changed, 96 deletions(-) diff --git a/lessons/lesson_08.ipynb b/lessons/lesson_08.ipynb index 5301d5e..bebf1ab 100644 --- a/lessons/lesson_08.ipynb +++ b/lessons/lesson_08.ipynb @@ -301,102 +301,6 @@ " break" ] }, - { - "cell_type": "markdown", - "id": "0fc324f7", - "metadata": {}, - "source": [ - "# Accessing NCBI\n", - "- WARNING: NCBI can and will block your institute, if you misuse this, even unintentionally, but they will try to reach you first.\n", - "- NCBI guidelines: https://www.ncbi.nlm.nih.gov/books/NBK25497/\n", - "- In brief:\n", - " - For more than 100 consecutive requests in a row, do that at a weekend or outside of US peak times\n", - " - No more than 3 queries per second\n", - " - **IMPORTANT**: always provide a valid email address for contacting you! \n", - " - Entrez allows you to search and download large amounts of data. Save those files rather than downloading them over and over again!\n", - " - Use alternatives to download large datasets, i.e. ftp server for all bacterial genomes etc.\n" - ] - }, - { - "cell_type": "markdown", - "id": "a65f491c", - "metadata": {}, - "source": [ - "- Allows programmatic database search similar to website\n", - "- Multiple databases are supported, i.e. nucleotide, pubmed\n", - "- Will return identifiers for downloading / accessing data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d52bf4d", - "metadata": {}, - "outputs": [], - "source": [ - "from Bio import SeqIO\n", - "from Bio import Entrez\n", - "\n", - "Entrez.email = \"finstermeier@mpusp.mpg.de\"\n", - "handle = Entrez.esearch(db=\"pubmed\", term=\"biopython\")\n", - "record = Entrez.read(handle)\n", - "print(record[\"IdList\"][:5])" - ] - }, - { - "cell_type": "markdown", - "id": "e428202c", - "metadata": {}, - "source": [ - "- Available databases: https://www.ncbi.nlm.nih.gov/books/NBK25497/table/chapter2.T._entrez_unique_identifiers_ui/?report=objectonly\n", - "- Keywords for search term can be found here: https://www.ncbi.nlm.nih.gov/books/NBK49540/\n", - "- Overview of available databases and parameters:\n", - " - https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d94519b6", - "metadata": {}, - "outputs": [], - "source": [ - "handle = Entrez.esearch(\n", - " db=\"nucleotide\", term=\"Cypripedioideae[Orgn] AND matK[Gene]\", idtype=\"acc\"\n", - ")\n", - "record = Entrez.read(handle)\n", - "print(record[\"IdList\"][:5])" - ] - }, - { - "cell_type": "markdown", - "id": "4fd85ecd", - "metadata": {}, - "source": [ - "# Downloading from NCBI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d794ccd", - "metadata": {}, - "outputs": [], - "source": [ - "with Entrez.efetch(\n", - " db=\"nucleotide\", rettype=\"gb\", retmode=\"text\", id=\"6273291\"\n", - ") as handle:\n", - " seq_record = SeqIO.read(handle, \"gb\") # using \"gb\" as an alias for \"genbank\"\n", - " print(f\"{seq_record.id} with {len(seq_record.features)} features\")\n", - "\n", - "\n", - "with Entrez.efetch(\n", - " db=\"nucleotide\", rettype=\"gb\", retmode=\"text\", id=\"6273291, 6273290, 6273289\"\n", - ") as handle:\n", - " for seq_record in SeqIO.parse(handle, \"gb\"):\n", - " print(f\"{seq_record.id}: {seq_record.description[:50]}...\")" - ] - }, { "cell_type": "markdown", "id": "0642bc14",