From b7758737cf839abd0b27688744a157ed12fba85f Mon Sep 17 00:00:00 2001 From: m-jahn Date: Thu, 23 Apr 2026 08:34:14 +0200 Subject: [PATCH 1/5] fix: removed duplicated course material from solutions --- solutions/solutions_01.ipynb | 563 +----------------- solutions/solutions_02.ipynb | 708 ----------------------- solutions/solutions_03.ipynb | 419 +------------- solutions/solutions_04.ipynb | 668 +-------------------- solutions/solutions_05.ipynb | 22 +- solutions/solutions_06.ipynb | 440 +------------- solutions/solutions_07.ipynb | 1060 +--------------------------------- solutions/solutions_08.ipynb | 874 +--------------------------- 8 files changed, 82 insertions(+), 4672 deletions(-) diff --git a/solutions/solutions_01.ipynb b/solutions/solutions_01.ipynb index c89981d..cc45bbd 100644 --- a/solutions/solutions_01.ipynb +++ b/solutions/solutions_01.ipynb @@ -2,571 +2,10 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Basics of Bioinformatics with Python and R\n", - "\n", - "An interactive course in 10 units for the members of the [MPUSP](https://www.mpusp.mpg.de) (Max-Planck-Unit for the Science of Pathogens).\n", - "\n", - "All course materials are collected in a central Github repository, https://github.com/MPUSP/python_course/. Issues can be reported [here](https://github.com/MPUSP/python_course/issues).\n", - "\n", - "For comments, criticism, and general feedback please contact the **authors** at `bioinformatics@mpusp.mpg.de`.\n", - "\n", - "Course authors and maintainers are:\n", - "\n", - "- [Knut Finstermeier](https://github.com/finstermeier), email finstermeier@mpusp.mpg.de\n", - "- [Rina Ahmed-Begrich](https://github.com/rabioinf), email begrich@mpusp.mpg.de\n", - "- [Michael Jahn](https://github.com/m-jahn), email jahn@mpusp.mpg.de\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "# Lesson 01: Introduction\n", - "\n", - "## Course materials and structure\n", - "\n", - "- all course participants can be contacted via the mailing list `python_2026@mpusp.mpg.de`\n", - "- all materials are available at our Github repository: https://github.com/MPUSP/python_course/ (currently private)\n", - "- each lesson is provided as a Jupyter notebook (`.ipynb`), or alternatively a Quarto (`.qmd`) or R markdown (`.Rmd`) notebook\n", - "- the course material is a blend of own works examples from Justin Bois from Caltech (http://bois.caltech.edu/)\n", - "- each lesson is supposed to be worked through in about 90 minutes\n", - "- the course is **interactive (!)**: we will mix small introductory lectures with hands-on exercises\n", - "- participants are expected to actively work through the notebooks, run code cells, and try to understand/solve the given tasks\n", - "- if we run out of time or can't complete a lesson, please try to finish it at home\n", - "- the course is designed for beginners, prior experience with programming is helpful but not required!\n", - "- you can ask questions at any time!\n", - "- and finally and most importantly: **the course is meant to be fun!** 😊\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Python and R\n", - "\n", - "- we will feature both Python and R programming languages\n", - "- both languages are wildly popular in bioinformatics and data science\n", - "\n", - "Here is a quick comparison of both languages:\n", - "\n", - "| Feature | Python | R |\n", - "| --------------------------- | ----------------- | ---------------------- |\n", - "| General purpose language | Yes | No (statistics) |\n", - "| Year of release | 1991 | 1993 |\n", - "| Readability | High | High |\n", - "| Object-oriented | Yes | Yes |\n", - "| Cost / License | Free / Permissive | Free / Permissive |\n", - "| Execution | Interpreter | Interpreter |\n", - "| Packages | Pypi: 715k | CRAN / Biocond. (250k) |\n", - "| Popularity on Github (2025) | 1st | 33rd |\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## What can we do with Python?\n", - "\n", - "- Data analytics\n", - "- Machine learing\n", - "- Mobile Apps\n", - "- Graphical user interfaces\n", - "- Web frameworks\n", - "- Multimedia\n", - "- Databases\n", - "- Networking\n", - "- Automation\n", - "- Scientific computing\n", - "- Text processing\n", - "- Image processing\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Jupyter Notebooks\n", - "\n", - "- interactive documents that **combine code, text, and visualizations**\n", - "- widely used in data science, machine learning, and scientific computing\n", - "- great for experimentation, reporting and sharing\n", - "- run individual cells and see the result immediately\n", - "- notebooks are not executable programs, they are just code/documentation collections\n", - "- Quarto and R markdown notebooks are similar alternatives\n", - "- All (Jupyter) notebooks can be rendered to HTML and PDF, perfect for documentation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Jupyter Lab\n", - "\n", - "- web-based interface for Jupyter notebooks\n", - "- we don't need to install anything locally\n", - "- course materials are already pre-installed on the GWDG JupyterHub server (if everything works out fine)\n", - "- otherwise we will instruct you how to obtain the materials\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "### GWDG Cocalc server\n", - "\n", - "https://cocalc.gwdg.de/\n", - "\n", - "- HU: use your single sign on (SSO) account.\n", - "- MPG: register at GWDG to get access\n", - "\n", - "Documentation:\n", - "\n", - "https://doc.cocalc.com/teaching-students.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with 'cells' in Jupyter\n", - "\n", - "- cells or 'code chunks' are the building blocks of notebooks\n", - "- each cell can contain code or text (Markdown)\n", - "- code cells can be executed individually, but **in the correct order: top to bottom (!)**\n", - "- code cells can be defined to run different languages (Python, R, Bash, etc.)\n", - "- outputs (text, plots, tables, etc.) are displayed directly below the code cell\n", - "- Output is stored in the notebook until the kernel is restarted or the output is cleared. Thus, outputs from previously run cells may still be visible even if the code that generated them has been changed or deleted.\n", - "\n", - "### Examples\n", - "\n", - "- observe how we can print text ('strings'), calculate, test conditions, or assign variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Code comments can be added with a hash sign (#)\n", - "# This is a comment and will not be executed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Hello World!\") # Code before the hash sign will be executed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "123 + 456" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "1 == 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "a = 3\n", - "b = 5\n", - "a + b**2 # this means to the power of 2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Python basics\n", - "\n", - "- the following basic rules apply to Python and many other programming languages\n", - "\n", - "### Code conventions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- code is **being executed line by line**, and later statements override previous ones" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "a = 2\n", - "a = 5\n", - "print(a)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- python uses **indentation** to define code blocks (function bodies, loops, conditionals, etc.)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if a > 3:\n", - " print(\"'a' is greater than 3\")\n", - "else:\n", - " print(\"'a' is less than or equal to 3\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Variables\n", - "\n", - "- Flexible data types\n", - "- Container to store value(s)\n", - "- assignment via `=`\n", - "\n", - "### Common **single value** (atomic) data types\n", - "\n", - "| Data type | Example | Meaning |\n", - "| :-------- | :---------------------- | :------------------------------------------------- |\n", - "| bool | val = True, val = False | Binary, has only two states |\n", - "| int | val = 42 | Integer, whole number \\-∞ < val < ∞ |\n", - "| float | val = 3.1415 | Float point, decimal number \\-∞ < val < ∞ |\n", - "| str | val = β€œHello World!” | Strings, all kind of text, always surrounded by β€œ ” or β€˜ ’ or ''' ''' or \"\"\" \"\"\"|\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `bool`eans are used for conditional statements, e.g. to control the flow of a program\n", - "- note how we can use variables and **dynamically formatted strings (\"f-strings\")**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "a = \"banana\"\n", - "b_word = a.startswith(\"b\")\n", - "\n", - "if b_word:\n", - " print(f\"The word {a} starts with 'b'\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `int` and `float` are used for mathematical operations\n", - "- They can be combined in expressions, and the result will be a `float` if any operand is a `float` (coercion)\n", - "- Python handles data type assignments for you" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "a = 2 # int\n", - "b = 3.14 # float\n", - "\n", - "print(\"a is of type\", type(a))\n", - "print(\"b is of type\", type(b))\n", - "print(\"a+a result is of\", type(a + a))\n", - "print(\"a+b result is of\", type(a + b))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `str`ings are very important and versatile data types (in bioinformatics)\n", - "- `str`ings can be manipulated with many built-in methods\n", - "- `str`ing variables are automatically detected by the surrounding quotes\n", - "- numbers (and other `type`s) can be converted (coerced) to strings with the `str()` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "a = \"my_string\"\n", - "b = \"another string\"\n", - "c = 47\n", - "d = str(c) # convert integer to string\n", - "\n", - "print(a + \" and \" + b)\n", - "print(f\"Combine {a} and {c}\")\n", - "print(type(d))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Common **multi-value** (composite) data types\n", - "\n", - "| Data type | Example | Meaning |\n", - "| :-------- | :-------------------- | :------------------------------------------------------------- |\n", - "| list | val = \\[1, 2, 3\\] | mutable, ordered list of (mixed) things |\n", - "| tuple | val = (1, 2, 3) | Immutable, ordered list of (mixed) things |\n", - "| set | val = {} | mutable, unordered list of (unique) things |\n", - "| dict | val = {”a”:10,”b”:20.5} | mutable, unordered association of key-value pairs (dictionary) |\n", - "\n", - "*mutable: can be partially and completely changed after creation\n", - "\n", - "*immutable: cannot be changed after creation\n", - "\n", - "*unordered: items have no defined order when accessing them" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- lists, tuples, sets, and dictionaries can store multiple values and of different data types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list_num = [1, 2, 10, 127]\n", - "\n", - "print(\"List contents:\", list_num)\n", - "print(\"First element:\", list_num[0])\n", - "print(\"Last element:\", list_num[-1])\n", - "print(\"Length of list:\", len(list_num))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- lists can be accessed via indexing and slicing\n", - "- indexing = single position access\n", - "- slicing = range access\n", - "- **Important:** lists are zero-indexed, i.e. the first item is at position 0\n", - "- **off by one errors** are one of the most common errors when working with indexing/slicing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Note how we are using a list defined above in a prior Jupyter cell!\n", - "print(list_num[2]) # indexing\n", - "print(list_num[1:3]) # slicing\n", - "# Note how we are using a list defined above in a prior Jupyter cell!\\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- list elements can be added, removed, or modified\n", - "- the short hand `+=` can be used to append elements to a list (instead of `a = a + [...]`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list_num += [256, 512]\n", - "print(\"Updated list contents:\", list_num)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `dict`s store key-value pairs, accessed via keys\n", - "- they are extremely important and popular in the python world" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fruits = {\"apple\": 1, \"banana\": 2, \"orange\": 3}\n", - "fruits[\"banana\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- will this work? We mix data types `int` and `str` in a single `dict`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fruits[\"mango\"] = 4\n", - "fruits[\"starfruit\"] = \"sold out\"\n", - "print(fruits)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- more about sets, dicts, and other useful data types will follow later" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Naming conventions\n", - "\n", - "- A variable must start with a letter, not a number\n", - "- Case sensitive, no white spaces, but prefer **snake_case**\n", - "- Use descriptive names, avoid single letters (`x`, `y`, `z`) for important variables\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# bad naming\n", - "MySUPERComplexMixedCaseVar = 10\n", - "my_var = 20\n", - "m = 30\n", - "\n", - "# good naming\n", - "chain_length = 100\n", - "num_sequences = 50" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Coding style\n", - "\n", - "- as always in life, **style matters** :-)\n", - "- good coding style improves readability and maintainability **for yourself and others**\n", - "- **imports** are usually placed at the beginning of a script\n", - "- functions are lower case `def my_function(): ...`\n", - "- keep lines shorter than 80 characters\n", - "- use **white space** around mathmatical operators, i.e. `results = 1 + 2` and not `results=1+2`\n", - "- avoid white space in function parameters `f(x, y=4)`\n", - "- use comments to explain non-obvious code parts\n", - "- use built-in code formatters (e.g. `black`, `prettier`), that will take care of styling and are included in modern IDEs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Example showing different code elements**\n", - "\n", - "- we import `numpy`, a powerful library for numerical computing in Python\n", - "- we import `matplotlib` for basic data visualization" - ] - }, - { - "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", + "# Lesson 01\n", "\n", - "# series of numbers defined by start, stop, step\n", - "x = np.arange(0, 4*np.pi, 0.1)\n", - "y = np.sin(x)\n", - "plt.plot(x, y)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ "## Exercises\n", "\n", "### Code conventions\n", diff --git a/solutions/solutions_02.ipynb b/solutions/solutions_02.ipynb index 5e2b5de..cfdc425 100644 --- a/solutions/solutions_02.ipynb +++ b/solutions/solutions_02.ipynb @@ -6,714 +6,6 @@ "source": [ "# Lesson 02\n", "\n", - "In this lesson, we will learn more about:\n", - "\n", - "- data types and conversions\n", - "- operators to work with combinations of vars\n", - "- strings, their formatting and manipulation\n", - "\n", - "## Recap of previous lecture\n", - "\n", - "- we learned about variables and assignments: `x = 5`, `name = \"Alice\"`\n", - "- we saw different data types: integers `5`, floats `3.14`, strings `\"hello\"`, lists `[1, 2, 3]`\n", - "- we practiced printing values using `print()`, correct commenting (`#`), and indentation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - }, - "tags": [] - }, - "source": [ - "## Data type conversions\n", - "\n", - "- data types can be converted into each other if compatible\n", - "- data types for variables do not need to be defined when initializing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "number = 123\n", - "print(number, type(number))\n", - "\n", - "number = str(number)\n", - "print(number, type(number))\n", - "\n", - "number = float(number)\n", - "print(number, type(number))\n", - "\n", - "number = int(number)\n", - "print(number, type(number))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- will this fail or not?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "number = int(\"not a number, but a string\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Operators\n", - "\n", - "- used for interactions between two entities\n", - "\n", - "| Type | Operator |\n", - "| ------------------------------ | :------: |\n", - "| addition | `+` |\n", - "| subtraction | `-` |\n", - "| multiplication | `*` |\n", - "| division | `/` |\n", - "| raise to power | `**` |\n", - "| modulo (remainder) | `%` |\n", - "| floor division (whole divider) | `//` |\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(1 + 1, 2 * 3, 2**4, 7 / 2, 7 // 2, 7 % 2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "print(3.3 + 5, 9**0.5, 3 * 1.0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- _some_ operators can also be used for non-numeric types, e.g. strings\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"result: \" + str(3 + 5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- multiplication of strings can have an unexpected effect" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"test\" * 3" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "- as we learned in the previous lesson, python knows short cuts to avoid laborious re-assignments\n", - "- it is possible to shortcut formulas to use a more comprehensive way of coding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "val = 5\n", - "print(val)\n", - "\n", - "val += 5\n", - "print(val)\n", - "\n", - "val **= 2\n", - "print(val)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Strings\n", - "\n", - "- Strings are text (or `character` in R), signified by the `str` type\n", - "- Strings are indicated by single `'`, double `\"`, or triple quotes `'''`\n", - "- String content is independend of used quotes\n", - "- Only triple qoutes include multiple lines (!)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Indexing and slicing\n", - "\n", - "- python has very good built-in support to work with strings, such as:\n", - "- _Indexing_ – referencing position(s) in a string\n", - "- _Slicing_ – extract ranges of a string\n", - "- note: **counting starts with 0** (and not with 1, unlike R)!\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "var = \"Hello world.\"\n", - "\n", - "print(var)\n", - "print(var[0])\n", - "print(var[1])\n", - "print(var[2:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "- here is a more systematic example to show how python uses **ranges** (indicated by colon `:`)\n", - "- note that the **last position is excluded**\n", - "- open ranges mean \"from beginning\" / \"until end\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "var = \"123456789\"\n", - "\n", - "print(var[1:3])\n", - "print(var[:3])\n", - "print(var[6:])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "- to start counting from the end, use negative numbers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(var[-1])\n", - "print(var[-2])\n", - "print(var[-5:-2])\n", - "print(var[-5:])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "- strings can be combined by the `+` operator\n", - "- strings can also be combined from lists using the built-in `join` method" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "var1 = \"Hello\"\n", - "var2 = \"world!\"\n", - "\n", - "# print\n", - "print(var1, var2)\n", - "\n", - "# concatenate\n", - "print(var1 + \" \" + var2)\n", - "\n", - "# join\n", - "\" and \".join([var1, var2])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## String formatting\n", - "\n", - "### 'White space'\n", - "\n", - "- a general term for all non-printable characters\n", - "- line break: `\\n` splits string into 2 lines\n", - "- tab: `\\t` introduces a tabulator white space" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Hello world.\")\n", - "print(\"Hello\\nworld.\")\n", - "print(\"Hello\\tworld.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Building strings with variables\n", - "\n", - "#### f-strings\n", - "\n", - "- one of the **great strengths** of python is its simplicity to build more complex strings from variables\n", - "- variables can be combined as long as their `type` matches\n", - "- `\"abc\" + 123` does *not* work\n", - "- but, almost every datatype can be converted into a string for printing and concatenation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(str(7))\n", - "print(str(3.14))\n", - "print(str([2.5, 3, \"abc\"]))\n", - "print(str(True))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "- python has different options to add variables to strings\n", - "- most convenient: **f-strings**, have an `f` placed directly before the string\n", - " - content like variables is added in-place with curly brackets `{var}`\n", - " - variables are automatically coerced to `str`, no extra string coercion required\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "var1 = 1969\n", - "var2 = \"20\"\n", - "var3 = \"July\"\n", - "\n", - "print(f\"Neil Armstrong landed on the moon at {var1}, {var2} of {var3}.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- we can also make in place calculations within f-strings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Neil Armstrong did not land on the moon in {var1 + 3}.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "- additional string formatting is possible by using a colon `:` and a parameter within the curly brackets `{` and `}`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\n", - " f\"The {5:03}th Chinese lunar explorer mission Chang'e landed on the moon on {'December'} {1:02}, {2000+20}.\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- floats can be formatted to a specific number of decimal places\n", - "- use the notation `.Nf`, where `N` is the number of decimal places.\n", - "- the total number of digits is specified by the number before the decimal point." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "pi = 3.141592653589793\n", - "print(f\"Pi starts like this: {pi:.4f}\")\n", - "print(f\"Pi starts like this: {pi:08.4f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Place holders\n", - "\n", - "- an alternative to f-strings are place holders\n", - "- use `%` followed by a letter giving the input `type`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| code | meaning |\n", - "| :--: | :-----------------------------------------------: |\n", - "| `%s` | string |\n", - "| `%i` | integer |\n", - "| `%d` | decimal |\n", - "| `%f` | float |\n", - "| `%%` | overwrites placeholder function and returns a \"%\" |\n", - "\n", - "(see https://docs.python.org/3/library/stdtypes.html#old-string-formatting for more)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Neil Armstrong landed on the moon in %i.\" % (1969))\n", - "print(\"Neil Armstrong landed on the moon on %s %i, %i.\" % (\"July\", 20, 1969))\n", - "print(\"Percentages are printed correctly in %i%% of all cases.\" % (99))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## String operations\n", - "\n", - "### Replacements\n", - "\n", - "- python has **powerful built-in** functions to manipulate strings\n", - "- built-in functions are invoked directly on the string using the dot `.()` notation\n", - "- to replace a string with another string, use for example `str.replace(, )`\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "var = \"Bananas are awesome!\"\n", - "print(var.replace(\"Bananas\", \"Apples\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- add an optional third argument `` to limit the number of replacements\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "var = \"Ask not what your country can do for you; ask what you can do for your country.\"\n", - "print(var.replace(\"country\", \"nation\"))\n", - "print(var.replace(\"country\", \"nation\", 1))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Finding substrings\n", - "\n", - "- find a substring within another string starting from left side: `str.find()`\n", - "- returns index of appearance relative to full string start\n", - "- returns -1 if absent\n", - "- limit search to a specific range with optional start/end arguments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq = 'GTACCTTGATTTCGTAA'\n", - "\n", - "print(seq.find('GTA'))\n", - "print(seq.find('CCT'))\n", - "print(seq.find('GTA', 1))\n", - "print(seq.find('GTA', 1, 10))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- likewise `.rfind()` finds the first appearance starting at the right side" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(seq.rfind('TTT'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- count appearances of a substring with `str.count()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq.count('TA')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### String manipulations\n", - "\n", - "- change case of a string with `str.upper()`, `str.lower()`, `str.swapcase()`\n", - "- `str.capitalize()` changes only the first letter to uppercase and the rest to lowercase" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq = \"GTAccTTgAttTcGTaa\"\n", - "print(seq.upper())\n", - "print(seq.lower())\n", - "print(seq.swapcase())\n", - "print(seq.capitalize())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- prune strings by removing leading and trailing characters with `str.strip()`, `str.lstrip()`, `str.rstrip()`\n", - "- prune strings with exact matches using `str.removeprefix()`, `str.removesuffix()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(seq.strip(\"T\")) # removes nothing\n", - "print(seq.lstrip(\"TAGC\")) # removes all leading letters matching the query\n", - "print(seq.rstrip(\"tagc\")) # removes all trailing letters matching the query\n", - "print(seq.removeprefix(\"GtAc\")) # removes exact prefix\n", - "print(seq.removesuffix(\"GTaa\")) # removes exact suffix" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- split strings using `str.split()`, which returns a list of the parts\n", - "- again, split accepts a second numeric argument defining how many times to split\n", - "- grab the desired part by using the list index `[]`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq = \"ACGT_GGCC_TGGA\"\n", - "print(seq.split(\"_\"))\n", - "print(seq.split(\"_\", 1))\n", - "print(seq.split(\"_\", 1)[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### String testing\n", - "\n", - "- python has many built-in functions to test for specific string properties\n", - "- test if a string starts or ends with a specific substring using `str.startswith()`, `str.endswith()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(seq.startswith(\"GTA\"))\n", - "print(seq.endswith(\"GTA\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- test properties of strings using the familyt of `str.is()` functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(seq.islower())\n", - "print(seq.isupper())\n", - "print(seq.isalpha())\n", - "print(seq.isascii())\n", - "print(seq.isdigit())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- get the lenghth of a string using `len()`\n", - "- we have seen this previously with lists -- this generic function works for most objects" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Sequence has length:\", len(seq))\n", - "print(\"Sequence without underscores has length:\", len(seq.replace(\"_\", \"\")))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ "## Exercises\n", "\n", "### Operators\n", diff --git a/solutions/solutions_03.ipynb b/solutions/solutions_03.ipynb index 7b706fc..de9e14d 100644 --- a/solutions/solutions_03.ipynb +++ b/solutions/solutions_03.ipynb @@ -4,425 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Lesson 3\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "In this lesson, we will learn more about:\n", - "\n", - "- logical operators (`==`, `!=`, `>`, ...)\n", - "- comparisons and conditions\n", - "- program control with `if`-`else` statements\n", - "- functions, the work horse of programming\n", - "\n", - "## Recap of previous lecture\n", - "\n", - "- we learned about operators (`+`, `-`, `*`, `/`, `//`, `%`)\n", - "- we worked a lot with strings: `find`, `replace`, slice (`str[2:4]`), split (`str.split()`), determine length (`len(str)`)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Relational operators\n", - "\n", - "- to make decisions in programs and control the flow of execution, we use relational operators\n", - "- they compare two entities\n", - "- statements for comparisons:\n", - "\n", - "| statement | meaning |\n", - "| :-------: | :-----------------: |\n", - "| \\== | equal |\n", - "| \\> | greater than |\n", - "| < | smaller than |\n", - "| \\>= | greater or equal to |\n", - "| <= | smaller or equal to |\n", - "| != | unequal |\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "print(3 == 3)\n", - "print(\"a\" == \"a\")\n", - "print(3 > 4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "outputs": [], - "source": [ - "print(4 != 5)\n", - "print(3 + 2 == 5)\n", - "print(2 == 2.00)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "#### Chaining relational operators\n", - "\n", - "- multiple comparisons can be combined in a single expression\n", - "- each comparison is tested pairwise\n", - "- stay reasonal and keep it readable (or you will end in hell when troubleshooting)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "val = 3\n", - "print(2 < val < 4)\n", - "print(3 < 4 < 5 < 6)\n", - "print(5 >= 4 != \"a\")\n", - "print(4 < 6 > 2 <= 2.1 != 2 + 2 == 4)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## if-else statements\n", - "\n", - "- if-else statements are essential parts of every programming language\n", - "- their purpose is to test conditions and trigger different actions based on the result\n", - "\n", - "### `if`-statement\n", - "\n", - "- syntax: `if` condition evaluates to `True` `:` then do something\n", - "- `else:` do something else\n", - "- the body of the `if` and `else` statement is indented (4 spaces `' '`)!\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if 3 + 3 == 6:\n", - " print(\"Result is correct.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq = \"GTACCTTGATTTCGTATTCTGAGAGGCTGCTGCTTAGCGGTAGCCC\"\n", - "\n", - "if seq.count(\"TTT\"):\n", - " print(f\"TTT found at position {1 + seq.find('TTT')}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### `elif` statement (else if)\n", - "\n", - "- optional, additional test(s) for evaluation\n", - "- requires an `if` statement before, is only evaluated if the `if` condition is False\n", - "- can be an endless number of tests\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if seq.count(\"ATG\"):\n", - " print(\"Found primary start codon ATG.\")\n", - "elif seq.count(\"GTG\"):\n", - " print(\"Found secondary start codon GTG.\")\n", - "elif seq.count(\"TTG\"):\n", - " print(\"Found tertiary start codon TTG.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### `else` statement\n", + "# Lesson 03\n", "\n", - "- lowest rank of tests for evaluation\n", - "- requires an `if` or `if`+`elif` statement before\n", - "- only once per `if` statement\n", - "- note the use of `not` to negate a condition test (evaluation changes from True to False and vice versa)\n", - "- in python, integers `1` and `0` are synonyms for booleans `True` and `False`, respectively\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not len(seq) % 3:\n", - " print(\"Sequence can be divided by 3, and hence can be translated.\")\n", - "elif len(seq) % 3 == 2:\n", - " print(\"Sequence cannot be divided by 3, 2 extra nucleotides present.\")\n", - "else:\n", - " print(\"Sequence cannot be divided by 3, 1 extra nucleotide present.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- if-conditions can be nested infinitely, but deep testing is discouraged for readability\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "codon = \"TTG\"\n", - "if seq.count(codon):\n", - " if seq.find(codon) == 0:\n", - " print(f\"Start codon {codon} is at the beginning of the sequence.\")\n", - " else:\n", - " if seq.find(codon) < 10:\n", - " print(f\"Start codon {codon} is within the first 10 nucleotides.\")\n", - " else:\n", - " print(f\"Start codon {codon} is beyond the first 10 nucleotides.\")\n", - "else:\n", - " print(f\"Found no start {codon} codon.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Functions\n", - "\n", - "- functions are **reusable, confined chunks** of code that perform a specific task\n", - "- functions are the **work horse** of programming, taking _defined inputs_ and producing _defined outputs_\n", - "- they help to modularize code, improve readability, and avoid redundancy\n", - "- **rule of thumb**: if you find yourself writing the same code more than twice, wrap it in a function\n", - "\n", - "### Syntax:\n", - "\n", - "```python\n", - "def function_name(input):\n", - " \"\"\"Docstring: Optional description of the function.\"\"\"\n", - " output = \n", - " return output\n", - "```\n", - "\n", - "Examples:\n", - "\n", - "- simple function that returns length of a sequence\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_length(seq):\n", - " \"\"\"Returns the length of the given sequence.\"\"\"\n", - " seq_no_whitespace = seq.strip()\n", - " return len(seq_no_whitespace)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "get_length(\" ATGCGTACGTA \")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- little larger function that prints a sequence summary\n", - "- function does not return a sensible output, just prints to terminal\n", - "- takes two input arguments, one with a default value which does not need to be provided\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def summary_seq(sequence, first_nt=10):\n", - " \"\"\"Prints a summary of the given DNA sequence.\"\"\"\n", - " seq_len = len(sequence)\n", - " print(\"Sequence length:\", seq_len)\n", - " print(f\"First {first_nt} nucleotide(s):\", sequence[:first_nt])\n", - " if not seq_len % 3:\n", - " print(f\"Sequence is translatable, having {seq_len / 3} codons\")\n", - " else:\n", - " print(f\"Sequence is not translatable, having {seq_len % 3} extra nucleotide(s)\")\n", - " return True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "summary_seq(\"ATGGTACCGTGCCGACGATGCATGTACGC\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Function arguments\n", - "\n", - "- by convention, functions should only operate on their inputs\n", - "- input, and newly created variables are contained within the function (called the function's **_scope_**) and can not be accessed from outside\n", - "- however, in python, functions can access and modify global variables outside their scope (use with care!)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def sort_list(l):\n", - " l.sort()\n", - " print(\"string sorted\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- function modifies the input object even without re-assignment\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "l = [\"c\", \"d\", \"a\", \"b\"]\n", - "sort_list(l)\n", - "print(l)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- we can also pass function arguments as `dict` instead of manually listing them\n", - "- the dict keys must match the function argument names, and the **dict is unpacked** with `**`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def repeat_str(string, times):\n", - " return string * times" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "args = {\"string\": \"ATG\", \"times\": 5}\n", - "repeat_str(**args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- functions can even take variable previously undefined arguments\n", - "- `**kwargs` reads 'key-worded arguments'\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def var_input(**kwargs):\n", - " for i, j in kwargs.items():\n", - " print(i, j)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "args = {\"a\": 1, \"b\": 2}\n", - "var_input(**args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ "## Exercises\n", "\n", "### Comparisons and Conditions\n" diff --git a/solutions/solutions_04.ipynb b/solutions/solutions_04.ipynb index 54e6621..3a05b1a 100644 --- a/solutions/solutions_04.ipynb +++ b/solutions/solutions_04.ipynb @@ -1,649 +1,13 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "0db153c1", - "metadata": {}, - "source": [ - "# Lesson 4" - ] - }, - { - "cell_type": "markdown", - "id": "0723565c", - "metadata": {}, - "source": [ - "In this lesson, we will learn more about:\n", - "\n", - "- lists and tuples\n", - "- built-in function len()\n", - "- built-in function range()\n", - "- built-in function enumerate()\n", - "- loops (`for` and `while`)\n", - "\n", - "## Recap of previous lecture\n", - "\n", - "- logical operators (`==`, `!=`, `>`, ...)\n", - "- comparisons and conditions\n", - "- program control with `if`-`else` statements\n", - "- functions, the work horse of programming\n" - ] - }, - { - "cell_type": "markdown", - "id": "6ae6c917", - "metadata": {}, - "source": [ - "# Lists\n", - "- an ordered sequence of elements\n", - "- defined and created by squared brackets `[` and `]`\n", - "- lists can contain anything, even mixed types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78a01e99", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2, 3, 4]\n", - "print(my_list)" - ] - }, - { - "cell_type": "markdown", - "id": "f552c7d8", - "metadata": {}, - "source": [ - "- lists can be nested (i.e. a list in a list in a list)\n", - "- lists can be combined and mulitplied" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10be5ef6", - "metadata": {}, - "outputs": [], - "source": [ - "print(my_list * 3)\n", - "my_nested_list = [[1, 2, 3], [3, 2, [1]], 'a', 'b', 'c', 'a', 'a', 'a']\n", - "print(my_nested_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9775a8e", - "metadata": {}, - "outputs": [], - "source": [ - "my_nested_list += my_list\n", - "print(my_nested_list)" - ] - }, - { - "cell_type": "markdown", - "id": "b3542cc2", - "metadata": {}, - "source": [ - "- lists can be indexed (as shown before for strings)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "631dd417", - "metadata": {}, - "outputs": [], - "source": [ - "print(my_nested_list)\n", - "print(my_nested_list[0])\n", - "print(my_nested_list[-4: ])" - ] - }, - { - "cell_type": "markdown", - "id": "2536de8a", - "metadata": {}, - "source": [ - "# Accessing list elements\n", - "- list elements can be extracted based on multiples of their index by specifying a third index number N `[x:y:N]` or `[::N]`\n", - "- this is called a stride" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51b0a3b4", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n", - "print(my_list)\n", - "print(my_list[:5])\n", - "print(my_list[::2])\n", - "print(my_list[::3])" - ] - }, - { - "cell_type": "markdown", - "id": "63a37a1e", - "metadata": {}, - "source": [ - "- invert a list with the same index operator `[::-1]`\n", - "- same rules apply as for positive numbers (extracting the multiple of an index after inverting it)" - ] - }, - { - "cell_type": "markdown", - "id": "74b89503", - "metadata": {}, - "source": [ - "# List operations\n", - "- `.append()` adds an element at the end of a list\n", - "- `.insert()` inserts an element before the provided index\n", - "- `.pop()` removes the element at the provided index and returns it\n", - "- `.remove()` removes the element matching first from left and does **NOT** return it, raises error when nothing is found\n", - "\n", - "\n", - "- all of the above functions modify the list in place and do not create a copy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4622cf65", - "metadata": {}, - "outputs": [], - "source": [ - "list1 = [1, 2, 3, 4, 5]\n", - "print(list1, '\\n')\n", - "\n", - "list1.append(6)\n", - "print(list1, '\\n')\n", - "\n", - "list1.insert(3, 'a')\n", - "print(list1, '\\n')\n", - "\n", - "print(list1.pop(3))\n", - "print(list1, '\\n')\n", - "\n", - "print(list1.remove(6))\n", - "print(list1)" - ] - }, - { - "cell_type": "markdown", - "id": "91f907f1", - "metadata": {}, - "source": [ - "- `.sort()` sorts list (ascending) in place (except with parameter reverse=True)\n", - "- `.reverse()` inverts a list in place\n", - " \n", - "\n", - "- separate function:\n", - "- `sorted()` creates a sorted (ascending) copy of a list (except with parameter reverse=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59d6ece0", - "metadata": {}, - "outputs": [], - "source": [ - "list1 = [1, 4, 5, 6, 2, 3]\n", - "print(list1, '\\n')\n", - "\n", - "list1.sort()\n", - "print(list1, '\\n')\n", - "\n", - "list1.reverse()\n", - "print(list1, '\\n')\n", - "\n", - "list1 = [1, 4, 5, 6, 2, 3]\n", - "print(list1)\n", - "list2 = sorted(list1, reverse=True)\n", - "print(list1)\n", - "print(list2)\n" - ] - }, - { - "cell_type": "markdown", - "id": "2d386131", - "metadata": {}, - "source": [ - "# Mutability of lists\n", - "- individual elements can be replaced (in place) and the order of a list can be modified\n", - "- therefore, a list can change without creating a new list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b0be527", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2, 3, 4, 5]\n", - "print(my_list)\n", - "my_list[4] = 'e'\n", - "print(my_list)\n" - ] - }, - { - "cell_type": "markdown", - "id": "4b800193", - "metadata": {}, - "source": [ - "# List references\n", - "- this can make your life hard\n", - "- a variable can reference a list instead of containing it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1206cde3", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2, 3, 4, 5]\n", - "print('original list: ', my_list)\n", - "new_list = my_list\n", - "print('apparent copy: ', new_list)\n", - "new_list[0] = 'a'\n", - "print('modified apparent copy:', new_list)\n", - "print('original list: ', my_list)" - ] - }, - { - "cell_type": "markdown", - "id": "c073132f", - "metadata": {}, - "source": [ - "- you need to create a copy of it first\n", - "- there are many ways, for example:\n", - " - create a complete index-based subset `[:]`\n", - " - use the list function `list()`\n", - " - use the copy method `.copy()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1a2b97a", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2, 3, 4, 5]\n", - "print('original list: ', my_list)\n", - "new_list = my_list.copy()\n", - "print('apparent copy: ', new_list)\n", - "new_list[0] = 'a'\n", - "print('modified apparent copy:', new_list)\n", - "print('original list: ', my_list)" - ] - }, - { - "cell_type": "markdown", - "id": "4613038b", - "metadata": {}, - "source": [ - "# Immutable lists: Tuples\n", - "- this datatype cannot be modified in any way without creating a new entity\n", - "- caveat: if a tuple contains a mutable element (e.g. a list), this element can be modified in place, but the tuple itself cannot be modified (i.e. you cannot add or remove elements from the tuple)\n", - "- otherwise they behave identical to lists\n", - "- it is initiated with regular parentheses `(` and `)` or with the function `tuple()`\n", - "- indexing works the same way as with lists" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19d31a21", - "metadata": {}, - "outputs": [], - "source": [ - "my_tuple = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)\n", - "print(my_tuple)\n", - "print(type(my_tuple))\n", - "my_second_tuple = ( 1, 2, 3, [])\n", - "my_second_tuple[3].append('a')\n", - "print(my_second_tuple)" - ] - }, - { - "cell_type": "markdown", - "id": "acff55a6", - "metadata": {}, - "source": [ - "# List and tuple unpacking\n", - "- if the number of variables on the left equals the number of entries on the right, entries are assigned to the left individually" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46310f38", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = ['a', 'b']\n", - "val1, val2 = my_list\n", - "print(val1)\n", - "print(val2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c2b2a38", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2, 3]\n", - "val1, val2, val3 = my_list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f75ffaf1", - "metadata": {}, - "outputs": [], - "source": [ - "my_tuple = ( [1, 2, 3], [4, 5, 6] )\n", - "list1, list2 = my_tuple\n", - "print(list1)\n", - "print( type(list1) )" - ] - }, - { - "cell_type": "markdown", - "id": "112fd495", - "metadata": {}, - "source": [ - "# Conversion of a single string into a list or tuple\n", - "- a string gets split up by each character and turned into the respective data format (each letter remains a string)\n", - "- `list()` returns a list of characters\n", - "- `tuple()` returns a tuple of characters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae049522", - "metadata": {}, - "outputs": [], - "source": [ - "string = 'abracadabra'\n", - "print(string)\n", - "print(list(string))\n", - "print(tuple(string))\n", - "print(set(string)) # will be explained later on" - ] - }, - { - "cell_type": "markdown", - "id": "da96237c", - "metadata": {}, - "source": [ - "## the function len()\n", - "- returns the length of a list, string, tuple, dictionary (number of keys) ...\n", - "- numbers do not have a length" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91394fb9", - "metadata": {}, - "outputs": [], - "source": [ - "word = 'Supercalifragilisticexpialidocious'\n", - "print(len(word))\n", - "\n", - "ingredients = [\"1/2 cup sugar\", \"1/2 cup packed brown sugar\", \"3 tablespoons all-purpose flour\", \"1 teaspoon ground cinnamon\", \"1/4 teaspoon ground ginger\", \"1/4 teaspoon ground nutmeg\", \"6 to 7 cups thinly sliced peeled tart apples\", \"1 tablespoon lemon juice\", \"Pastry for double-crust pie (9 inches)\", \"1 tablespoon butter\", \"1 large egg white\"]\n", - "print(len(ingredients))" - ] - }, - { - "cell_type": "markdown", - "id": "94541cd1", - "metadata": {}, - "source": [ - "## the function range()\n", - "- generator function that creates iterators\n", - "- iterators are their own class/data type\n", - "- can be converted to a list via `list()` or a tuple via `tuple()`\n", - "- `range()`\n", - " - creates a list of natural numbers (starts at 0 by default) in a provided interval (default = 1)\n", - " - mandatory length argument\n", - " - optional start argument\n", - " - optional step size argument" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15b94dc6", - "metadata": {}, - "outputs": [], - "source": [ - "print( range(5) )\n", - "print( type( range(5) ) )\n", - "print( list( range(5) ) )\n", - "print( list( range(5, 10, 2) ) )" - ] - }, - { - "cell_type": "markdown", - "id": "4a9c68d6", - "metadata": {}, - "source": [ - "## the function enumerate()\n", - "- creates an iterator with an index and the element of the provided list etc\n", - "- optional start argument" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03679d03", - "metadata": {}, - "outputs": [], - "source": [ - "l = ['a', 'b', 'c']\n", - "print( type( enumerate(l) ) )\n", - "print( list( enumerate(l, 55) ) )" - ] - }, - { - "cell_type": "markdown", - "id": "9f36ce00", - "metadata": {}, - "source": [ - "# Loops\n", - "\n", - "## `For` loops\n", - "- used to iterate over a sequence (like a list, tuple, dictionary, set, or string)\n", - "- syntax: `for item in sequence:`\n", - "- indented block of code to be executed for each item in the sequence\n", - "\n", - "- Loop controls (`break`, `continue`, `pass`, `else` in loops)\n", - "- Nested loops" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdb969fa", - "metadata": {}, - "outputs": [], - "source": [ - "for value in [1, 2, 3, 4, 5]:\n", - " print(value)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9202d6e1", - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(5):\n", - " print(i)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7213865c", - "metadata": {}, - "outputs": [], - "source": [ - "genes = ['BRCA1', 'TP53', 'EGFR', 'VEGFA']\n", - "for index, gene in enumerate(genes, start=1):\n", - " print(f\"Gene {index}: {gene}\")" - ] - }, - { - "cell_type": "markdown", - "id": "4f597fd7", - "metadata": {}, - "source": [ - "## `While` loops\n", - "- used to repeatedly execute a block of code as long as a certain condition is true\n", - "- syntax: `while condition:`\n", - "- indented block of code to be executed while the condition is true" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "889fc250", - "metadata": {}, - "outputs": [], - "source": [ - "value = 10\n", - "while value > 0:\n", - " print(value)\n", - " value -= 1\n", - "print(\"Liftoff!\")" - ] - }, - { - "cell_type": "markdown", - "id": "9ac91093", - "metadata": {}, - "source": [ - "- be aware of infinite loops\n", - "- happen if a certain codition is always met: i.e.\n", - "`while True:` or a test cannot fail and return False" - ] - }, - { - "cell_type": "markdown", - "id": "e8e430b7", - "metadata": {}, - "source": [ - "# Loop controls\n", - "\n", - "## Break\n", - "- keyword to interrupt a loop\n", - "- used without any argument" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "effa42cc", - "metadata": {}, - "outputs": [], - "source": [ - "count = 0\n", - "while True:\n", - " count += 1\n", - " if count > 10:\n", - " break\n", - "print(F'Count reached {count}. Done.')" - ] - }, - { - "cell_type": "markdown", - "id": "cf76d2f4", - "metadata": {}, - "source": [ - "## Continue & pass\n", - "- `continue` skips the rest of the current loop iteration and moves to the next iteration\n", - "- `pass` is a placeholder that does nothing, useful for empty loops or functions\n", - "- used without any argument" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb7061a9", - "metadata": {}, - "outputs": [], - "source": [ - "# notice the difference between both loops\n", - "for n in range(5):\n", - " if n == 2:\n", - " continue\n", - " print(\"continue loop:\", n)\n", - "\n", - "print(\"---\")\n", - "\n", - "for n in range(5):\n", - " if n == 2:\n", - " pass\n", - " print(\"pass loop:\", n)" - ] - }, - { - "cell_type": "markdown", - "id": "419b3678", - "metadata": {}, - "source": [ - "## Else (in loops)\n", - "- can be used with `for` and `while` loops\n", - "- the `else` block is executed when the loop is not terminated by a `break`\n", - "- useful to execute code when a loop completes normally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12d38bf9", - "metadata": {}, - "outputs": [], - "source": [ - "print('First example:')\n", - "for i in range(10):\n", - " if i > 8:\n", - " print('Interrupting first for-loop.')\n", - " break\n", - "else:\n", - " print('First for loop finished without any issues.')\n", - " \n", - "print('\\nSecond example:')\n", - "for i in range(10):\n", - " if i > 10: break\n", - "else:\n", - " print('Second for loop finished without any issues.')\n" - ] - }, { "cell_type": "markdown", "id": "955390ed", "metadata": {}, "source": [ - "# Exercises" + "# Lesson 04\n", + "\n", + "## Exercises" ] }, { @@ -651,11 +15,12 @@ "id": "c1125a90", "metadata": {}, "source": [ - "## Lists\n", + "### Lists\n", + "\n", "- define a list of 5 strings, where each string is a sport\n", "- count the number of sports inside the list\n", "- output each sport with the following line \"I like to play {}\"...\n", - "- for this list, print out each sports first letter in capital form" + "- for this list, print out each sports first letter in capital form\n" ] }, { @@ -840,9 +205,10 @@ "id": "85990d7a", "metadata": {}, "source": [ - "## Tuples\n", + "### Tuples\n", + "\n", "- create a tuple with 5 elements\n", - "- reverse the tuple" + "- reverse the tuple\n" ] }, { @@ -974,9 +340,10 @@ "id": "f0c22f4f", "metadata": {}, "source": [ - "## Built-in functions\n", + "### Built-in functions\n", + "\n", "- use the function `len()` to get the length of this nucleotide sequence `seq` (eGFP) provided below\n", - "- What is the length of it's corresponding protein sequence?" + "- What is the length of it's corresponding protein sequence?\n" ] }, { @@ -1042,14 +409,16 @@ "id": "51fcb3ab", "metadata": {}, "source": [ - "## For loops\n", + "### For loops\n", + "\n", "> Using the string `string` provided below, answer the following questions:\n", + "\n", "- how many elements separated by comma are in the string (provide 1 solutions)\n", "- how many elements have a K in them\n", "- how many elements contain an S but not an L\n", "- how many elements contain a C and do not start with a C\n", "\n", - "> Hint: The following string methods might be useful: `split()`, `count()`, `find()`, `index()`, `startswith()`, `endswith()`, the `in` operator, and the built-in function `len()`." + "> Hint: The following string methods might be useful: `split()`, `count()`, `find()`, `index()`, `startswith()`, `endswith()`, the `in` operator, and the built-in function `len()`.\n" ] }, { @@ -1084,11 +453,12 @@ "id": "b81bdf71", "metadata": {}, "source": [ - "## While loops\n", + "### While loops\n", + "\n", "- write a while loop that increments our counter `count` and but print only multiples of 7\n", "- terminate after printing out 7 numbers\n", "\n", - "> Hint: you can use the modulus operator `%` to check if a number is a multiple of 7 (i.e. `count % 7 == 0`)." + "> Hint: you can use the modulus operator `%` to check if a number is a multiple of 7 (i.e. `count % 7 == 0`).\n" ] }, { diff --git a/solutions/solutions_05.ipynb b/solutions/solutions_05.ipynb index 5fc34d6..a9de22f 100644 --- a/solutions/solutions_05.ipynb +++ b/solutions/solutions_05.ipynb @@ -5,7 +5,9 @@ "id": "8b6bf4b0", "metadata": {}, "source": [ - "# Exercises" + "# Lesson 05\n", + "\n", + "## Exercises\n" ] }, { @@ -13,7 +15,7 @@ "id": "77384052", "metadata": {}, "source": [ - "## Dictionaries\n", + "### Dictionaries\n", "\n", "- Create a dictionary with the following key-value pairs: `name: \"Alice\"`, `age: 30`, `city: \"New York\"`.\n", "- Access the value associated with the key `name` and print it using an f-string.\n", @@ -94,13 +96,13 @@ "- Advanced task: Sort the dictionary starting with the lowest count using the `sorted()` function and print the results.\n", "\n", "> HINT: The sorted() function will by default sort a dictionary by its keys.\n", - " To sort by values, you can use the `key` parameter with a lambda function that specifies to sort by the second item in the key-value pair (the value):\n", - " `sorted(dict.items(), key=lambda item: item[1])`\n" + "> To sort by values, you can use the `key` parameter with a lambda function that specifies to sort by the second item in the key-value pair (the value):\n", + "> `sorted(dict.items(), key=lambda item: item[1])`\n" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "244eebf3", "metadata": {}, "outputs": [ @@ -135,7 +137,7 @@ "print(dict(sorted([(value, key) for key, value in dict2.items()])))\n", "\n", "# or better, sort by using the key parameter\n", - "print(dict(sorted(dict2.items(), key=lambda item: item[1])))\n" + "print(dict(sorted(dict2.items(), key=lambda item: item[1])))" ] }, { @@ -143,9 +145,9 @@ "id": "af2cee16", "metadata": {}, "source": [ - "## Sets\n", + "### Sets\n", "\n", - "- Create a set of unique numbers from the list below" + "- Create a set of unique numbers from the list below\n" ] }, { @@ -253,7 +255,7 @@ "id": "96bf460c", "metadata": {}, "source": [ - "## List comprehensions\n", + "### List comprehensions\n", "\n", "- Create a list of only even numbers from 0 to 20 (inclusive) using `range`.\n", "- Create a second list that contains the squares of those numbers using a _list comprehension_.\n", @@ -294,7 +296,7 @@ "id": "5c18b5a0", "metadata": {}, "source": [ - "## Dict comprehensions\n", + "### Dict comprehensions\n", "\n", "- Using the following list of Countries, create a dictionary where the keys are the country names and the values are the lengths of the country names.\n", "- Filter for Countries whose names exceed a length of 10 characters.\n" diff --git a/solutions/solutions_06.ipynb b/solutions/solutions_06.ipynb index adfa19b..a61421c 100644 --- a/solutions/solutions_06.ipynb +++ b/solutions/solutions_06.ipynb @@ -2,445 +2,20 @@ "cells": [ { "cell_type": "markdown", - "id": "34da94b2", - "metadata": {}, - "source": [ - "\n", - "# Lesson 6" - ] - }, - { - "cell_type": "markdown", - "id": "78baa9df", - "metadata": {}, - "source": [ - "Recap of previous lesson:\n", - "- dictionaries\n", - "- sets and set operations\n", - "- comprehensions (list, dict, set)" - ] - }, - { - "cell_type": "markdown", - "id": "3e36cc83", - "metadata": {}, - "source": [ - "In this lesson, we will cover:\n", - "- functions\n", - "- global vs local variables\n", - "- file handling" - ] - }, - { - "cell_type": "markdown", - "id": "6bc54d5b", - "metadata": {}, - "source": [ - "## Functions\n", - "- functions are blocks of code that can be referenced and easily reused\n", - "- can utilize parameters (arguments)\n", - "- argument order matters, but can be circumvented by using specific argument names (kwargs = keyword arguments)\n", - "- runs when called\n", - "- functions handle variables isolated from the outside code (usually, **exceptions apply**)\n", - "- can return results via `return`, which is optional\n", - "- defined by the keyword `def`\n", - "- requires a name (same rules apply as for variables, i.e. no digits at the beginning or no spaces)\n", - "- can have defaults (defined after the list of arguments)\n", - "- optionally (but good practise), each function has a doc string describing the function's purpose commonly using triple quotes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5c4b6cc", - "metadata": {}, - "outputs": [], - "source": [ - "# function w/o arguments\n", - "\n", - "def my_function():\n", - " res = 'Hello from inside the function!' \n", - " print(res) \n", - " return res\n", - " \n", - "my_function()\n", - "print('Hello back from the outside of the function.')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "665eee0e", - "metadata": {}, - "outputs": [], - "source": [ - "# basic arguments for function\n", - "def ratio(num1, num2):\n", - " '''calculates the ratio of 'num1' vs 'num2'.'''\n", - " return num1 / num2\n", - "\n", - "print( ratio(4, 2) )\n", - "\n", - "# help text via the built-in `help` function\n", - "help(ratio)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "baa10fdb", - "metadata": {}, - "outputs": [], - "source": [ - "def einstein():\n", - " \"\"\"Prints a message but does not return anything.\"\"\"\n", - " print(\"Insanity: doing the same thing over and over again and expecting different results.\")\n", - " \n", - "result = einstein()\n", - "print('\\nReturn:', result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf0a6932", - "metadata": {}, - "outputs": [], - "source": [ - "# function with named arguments\n", - "def calc_difference(val1, val2):\n", - " diff1 = val1 - val2\n", - " return diff1\n", - "\n", - "print( calc_difference(6, 3) )\n", - "print( calc_difference(val1=3, val2=6) )\n", - "print( calc_difference(val2=3, val1=6) )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b827163b", - "metadata": {}, - "outputs": [], - "source": [ - "# function with defaults\n", - "def new_function(arg1, arg2, arg3='test', args4='another one'):\n", - " some_result = arg1 * arg2\n", - " return (some_result, arg3, args4)\n", - "\n", - "print( new_function(5, 6) )\n", - "print( new_function(8, 7, 'bla') )\n", - "print( new_function(8, 7, 'bla', 'foo') )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26fa7568", - "metadata": {}, - "outputs": [], - "source": [ - "# function with defined input and output data types\n", - "def new_function(arg1: list, arg2: int, arg3: str ='test') -> str: #\n", - " some_result = f\"result for {arg3}: {sum(arg1) * arg2}\"\n", - " return some_result\n", - "\n", - "print( new_function([2, 3], 3) )\n", - "print( new_function([1, 2], 2, 'c') )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31baec33", - "metadata": {}, - "outputs": [], - "source": [ - "print( new_function(['a','b'], 'a') )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d752037", - "metadata": {}, - "outputs": [], - "source": [ - "# function with defined input and output data types - example for different return type\n", - "def new_function(arg1: list, arg2: int, arg3: str ='test') -> int:\n", - " some_result = str(arg1) * arg2 + arg3\n", - " return some_result\n", - "\n", - "print( new_function(['a','b'], 3) )\n", - "print('The anticipated return data type is not being checked and will not raise an error!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cda70047", - "metadata": {}, - "outputs": [], - "source": [ - "# isolated variables\n", - "def isolated(a, b):\n", - " a = a + b\n", - " return [a, b]\n", - "\n", - "a = 20\n", - "b1 = 5\n", - "res = isolated(a, b1)\n", - "\n", - "print(a, b1)\n", - "print(res)" - ] - }, - { - "cell_type": "markdown", - "id": "6485cfde", - "metadata": {}, - "source": [ - "## Additional comments on functions\n", - "- there are a lot of build-in functions like `len()` in Python\n", - "- the complete list can be found here: https://docs.python.org/3/library/functions.html\n", - "- **never define a function name that overwrites a build-in function** (it will mess with your code in unpredictable ways)\n", - "- the same is true for all keywords like `def`, `for`, `in` etc. (though the interpreter will just report an error)\n", - "- the full list of keywords can be found here: https://docs.python.org/3/reference/lexical_analysis.html#keywords" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92355eff", - "metadata": {}, - "outputs": [], - "source": [ - "# example overwriting a built in function\n", - "print( sum([1, 2, 3]) )\n", - "\n", - "def sum(a):\n", - " return a\n", - "\n", - "print( sum([1, 2, 3]) )" - ] - }, - { - "cell_type": "markdown", - "id": "ec2e75fa", - "metadata": {}, - "source": [ - "- dictionaries can be used as keyword input for functions\n", - "- the dictionary needs to match the function arguments completely\n", - "- unpack dicts in function arguments using the `**` operator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4eed7da1", - "metadata": {}, - "outputs": [], - "source": [ - "def multiply(factor1, factor2):\n", - " return factor1 * factor2\n", - "\n", - "print( multiply(2, 3) )\n", - "\n", - "my_dict = {'factor1': 5, 'factor2': 6}\n", - "print( multiply(**my_dict) )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "638ba311", - "metadata": {}, - "outputs": [], - "source": [ - "my_dict = {'factor1': 2, 'factor2': 3, 'factor3': 4}\n", - "print( multiply(**my_dict) )" - ] - }, - { - "cell_type": "markdown", - "id": "b6b8f745", - "metadata": {}, - "source": [ - "## Global vs local variables\n", - "- variables defined within a function are local to that function and cannot be accessed outside of it\n", - "- variables defined outside of any function are global and can be accessed from anywhere in the code, including inside functions (but not modified unless declared as global within the function)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e839327d", - "metadata": {}, - "outputs": [], - "source": [ - "def function_l():\n", - " s = \"I love Minneapolis!\"\n", - " print('Var s inside local function:', s, '\\n')\n", - "\n", - "def function_g():\n", - " global s\n", - " s = \"I love Seattle!\"\n", - " print('Var s inside global function:', s, '\\n')\n", - "\n", - "s = \"I love NYC!\" \n", - "print('Var s outside any function:', s, '\\n')\n", - "\n", - "function_l()\n", - "print('Var s outside local function:', s, '\\n')\n", - "\n", - "function_g()\n", - "print('Var s outside global function:', s)" - ] - }, - { - "cell_type": "markdown", - "id": "182e2169", - "metadata": {}, - "source": [ - "> Personal recommendation: avoid global variables, they can make your code very hard to debug and understand" - ] - }, - { - "cell_type": "markdown", - "id": "f28f70c9", + "id": "9ff1a083", "metadata": {}, "source": [ - "# Handling files\n", - "- files are an important concept to store and access information\n", - "- to handle files, Python can read and write humanly readable files (more or less) directly\n", - "- for more complex input files (i.e. compressed or binary data files like bam), external libraries are required\n", + "# Lesson 06\n", "\n", - "
\n", + "## Exercises\n", "\n", - "- to open a file, we use the built-in function `open(filename,mode)`, which creates an iterator by line over the file\n", - "- to close a file, we use the method `.close()`. This makes sure, that everything has been written to the file. \n", + "### Functions\n", "\n", - "| mode | meaning | comment |\n", - "| :---: | :---: | :--- |\n", - "| 'r' | read | default |\n", - "| 'w' | write | overwrites files |\n", - "| 'a' | append | adds to the end of an existing file |\n", - "\n", - "- closing a data stream too early (i.e. when writing) will **not** create an error, but will become a problem later on" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c5ea49b", - "metadata": {}, - "outputs": [], - "source": [ - "infile = open('../data/seqs.fas', 'r')\n", - "for line in infile:\n", - " print(repr(line)) # repr() shows the string as it is stored in memory, including special characters like \\n for newlines\n", - " print(line)\n", - "infile.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2b62fc1", - "metadata": {}, - "outputs": [], - "source": [ - "outfile = open('../data/new_seqs.fas', 'w')\n", - "for e, seq in enumerate(['aaaa', 'cccc', 'gggg', 'tttt']):\n", - " outfile.write(F\">{e}\\n{seq}\\n\")\n", - "outfile.close() " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c3f06fb", - "metadata": {}, - "outputs": [], - "source": [ - "infile = open('../data/new_seqs.fas', 'r')\n", - "for line in infile:\n", - " print(line.strip()) # removes white spaces and line breaks on the right side, i.e. aaaa\\n -> aaaa\n", - "infile.close()" - ] - }, - { - "cell_type": "markdown", - "id": "543d3fda", - "metadata": {}, - "source": [ - "- there are alternative semantic ways to access files\n", - "- syntax is `with as :` \n", - "- this implies a proper closing when leaving this code construct / finishing processing\n", - "- the keyword `as` function as an alias" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62b4eb0a", - "metadata": {}, - "outputs": [], - "source": [ - "with open('../data/seqs.fas', 'a') as outfile:\n", - " for e, seq in enumerate(['acgt', 'tgca'], start=4): # optional start parameter for enumerate, default is 0\n", - " outfile.write(F\">{e}\\n{seq}\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2adb46a", - "metadata": {}, - "outputs": [], - "source": [ - "with open('../data/seqs.fas', 'r') as infile:\n", - " for line in infile:\n", - " print(line.strip())" - ] - }, - { - "cell_type": "markdown", - "id": "014e605c", - "metadata": {}, - "source": [ - "# Reading a whole file\n", - "- it is possible to read a whole file with the method `.read()` from an open file stream without processing it line by line\n", - "- this might not be advisable in most situations, because it just fills up the memory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "948f8bd3", - "metadata": {}, - "outputs": [], - "source": [ - "with open('../data/seqs.fas', 'r') as infile:\n", - " content = infile.read()\n", - "\n", - "print( repr(content) )" - ] - }, - { - "cell_type": "markdown", - "id": "9ff1a083", - "metadata": {}, - "source": [ - "# Exercises\n", - "\n", - "## Functions\n", "- write a function that serves as a calculator for the four basic operations (addition, subtraction, multiplication, division)\n", "- the function should take three arguments: the first number, the second number, and the operation as a string (i.e. \"add\" or \"+\")\n", "- the function should return the result of the operation\n", "- the function should handle division by zero gracefully (i.e. return \"undefined\" or something similar)\n", - "- the function should have a doc string explaining its purpose and usage (enclosed by three quotes `'''`)" + "- the function should have a doc string explaining its purpose and usage (enclosed by three quotes `'''`)\n" ] }, { @@ -582,9 +157,10 @@ "id": "2958315e", "metadata": {}, "source": [ - "## file handling\n", + "### File handling\n", + "\n", "- create a function to create a new file and write the sequence from the previous exercise (ORFs) into it\n", - "- read the file and print its content to the console" + "- read the file and print its content to the console\n" ] }, { diff --git a/solutions/solutions_07.ipynb b/solutions/solutions_07.ipynb index 56ac19b..4f3c9d5 100644 --- a/solutions/solutions_07.ipynb +++ b/solutions/solutions_07.ipynb @@ -1,1052 +1,15 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "5845bb81", - "metadata": {}, - "source": [ - "# Lesson 7\n", - "\n", - "## Recap of the last lesson\n", - "- functions\n", - "- local and global variables\n", - "- file handling\n", - "\n", - "## In this lesson, we will cover:\n", - "- external libraries:\n", - " - random\n", - " - gzip\n", - " - argparse\n", - " - math\n", - " - re\n", - " - numpy\n", - " - pandas\n", - "- tidy data" - ] - }, - { - "cell_type": "markdown", - "id": "434622f8", - "metadata": {}, - "source": [ - "## External libraries in general\n", - "- Python has a large ecosystem of external libraries\n", - "- To load a library use ```import ```, i.e. ```import random```\n", - "- Aliases can be used to shorten the name of the library, i.e. ```import seaborn as sns```\n", - "- The default installation comes with external libraries, more can be installed using ```pip``` or ```conda```\n", - "- Installation happens outside of the python code, i.e. in the terminal:\n", - "```python3 -m pip install seaborn```\n", - "- Once installed, they can be imported and used in your Python code\n", - "
\n", - "\n", - "- Jupyter has it's own \"isolated\" environments\n", - " - first test, if a certain library is already available by importing it in your code\n", - " - if not, use the following code once (it will stay in your Jupyter environment and you do not need to install it again for that instance)\n", - " - `import sys`\n", - " - `!{sys.executable} -m pip install `, i.e. `!{sys.executable} -m pip install biopython`\n", - "\n", - "
\n", - "\n", - "> We will not be able to even scratch the surface of what is possible with external libraries" - ] - }, - { - "cell_type": "markdown", - "id": "97feb185", - "metadata": {}, - "source": [ - "# Library random\n", - "- Library to create (pseudo-) random numbers / selections (integers, sequences, distributions, etc.)\n", - "- Useful for simulations, shuffling data, random sampling, etc.\n", - "- Not suitable for cryptographic purposes\n", - "- For further information see https://docs.python.org/3.6/library/random.html\n", - "- Selected functions:\n", - " - `random.random()`: returns a random float between 0.0 and 1.0\n", - " - `random.randint(a, b)`: returns a random integer N such that a <= N <= b\n", - " - `random.choice(seq)`: returns a random element from the non-empty sequence seq\n", - " - `random.choices(seq, k=i)`: returns a list of elements ```i``` chosen from the non-empty sequence seq **with** replacement\n", - " - `random.sample(population, k)`: returns a list of k unique elements chosen from the population sequence or set **without** replacement\n", - " - `random.shuffle(x)`: shuffles the sequence x in place" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "970c5806", - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "#examples:\n", - "print('Random integer:', random.randint(5, 10)) # a random integer N such that a <= N <= b\n", - "\n", - "list1 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n", - "print('Random choice:', random.choice(list1)) # a random element from list\n", - "\n", - "random.shuffle(list1)\n", - "print('In-place shuffle:', list1) #Shuffle the sequence x in place\n", - "\n", - "print('Unique subset:', random.sample(list1, 4)) # a k length list of unique elements from population. random sampling without replacement\n", - "\n", - "print('Random float, uniform distribution:', random.random()) # random float number [0.0, 1.0]\n", - "\n", - "print('Random float gauss distribution:', random.gauss(mu=0.5, sigma=0.66)) #Gaussian distribution, mu=mean, sigma=standard deviation\n", - "\n", - "print('Random sequence:', ''.join( random.choices( list('ACTG'), k=50) ) )" - ] - }, - { - "cell_type": "markdown", - "id": "c74c0b37", - "metadata": {}, - "source": [ - "# Library gzip\n", - "- gzip is a library to read and write gzip files, which are compressed files that save disk space and can be read faster than uncompressed files\n", - "- gzip files are binary and not human-readable\n", - "- can be used very similarly to the built-in `open()` function, but with the added benefit of compression\n", - "- i.e. sequencing data (FASTQ files) are often stored in gzip format to save space\n", - "\n", - "
\n", - "\n", - "- Syntax:\n", - " - ```import gzip```\n", - " - ```with gzip.open(filename, mode) as infile:```)\n", - " - Modes:\n", - " - `rt` for reading text files (default is `rb` for reading binary files)\n", - " - `wt` for writing text files (`wb` for writing binary files)\n", - " - `at` for appending text files (`ab` for appending binary files)\n", - "> reading from disk is slower than extracting from memory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26aeb07f", - "metadata": {}, - "outputs": [], - "source": [ - "import gzip\n", - "\n", - "with gzip.open('../data/my_compressed_data.txt.gz', 'wt') as outfile:\n", - " outfile.write('test data 123456789')\n", - "\n", - "try:\n", - " with open('data/my_compressed_data.txt.gz', 'r') as infile:\n", - " print(infile.read())\n", - "except Exception as e:\n", - " print('Error reading compressed file:', e, '\\n')\n", - "\n", - "with gzip.open('../data/my_compressed_data.txt.gz', 'rt') as infile:\n", - " print(infile.read())" - ] - }, - { - "cell_type": "markdown", - "id": "df81b668", - "metadata": {}, - "source": [ - "# Library argparse\n", - "- argument parser used for handing parameters over when starting a program via command line\n", - "- requires object initiation via `argparse.ArgumentParser()`\n", - "- optional argument `description`\n", - "- requires parameters to work via `.add_argument()`\n", - "- minimum arguments for a program parameter are the parameter indicator and the destination for storing the input\n", - "- Destination name becomes object attribute to address stored values\n", - "- many optional parameters:\n", - " - `default`\n", - " - `action`\n", - " - `help`\n", - " - `type`\n", - "- object-specific function needs to be called to get all parameters as a global variable\n", - "- for further information read https://docs.python.org/3/library/argparse.html\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "854c0a6b", - "metadata": {}, - "outputs": [], - "source": [ - "import argparse\n", - "parser = argparse.ArgumentParser(description='this is a test program')\n", - "parser.add_argument('-i', dest='info')\n", - "parser.add_argument('--unit','-u', dest='unit_test', default=False, action='store_true', help='run unit test')\n", - "args = parser.parse_args()\n", - "\n", - "# print(args.info)\n", - "# print(args.unit_test)" - ] - }, - { - "cell_type": "markdown", - "id": "aef48781", - "metadata": {}, - "source": [ - "This becomes relevant when we start writing our own programs and want to make them more flexible by allowing users to specify parameters when running the program via command line.\n", - "\n", - "Example:\n", - "`python my_program.py --input_file ../data/my_input.txt --output_file ../data/my_output.txt --num_iterations 1000`" - ] - }, - { - "cell_type": "markdown", - "id": "8085b3c8", - "metadata": {}, - "source": [ - "# Library math\n", - "- Large variety of math operations:\n", - " - Number-theoretic and representation functions\n", - " - Power and logarithmic functions\n", - " - Trigonometric functions\n", - " - Angular conversion\n", - " - Hyperbolic functions\n", - " - Special functions\n", - " - Constants\n", - "- For further information see https://docs.python.org/3/library/math.html\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "810e744a", - "metadata": {}, - "outputs": [], - "source": [ - "import math\n", - "#examples:\n", - "print(math.pi) # mathematical constant Ο€ = 3.141592…\n", - "print(math.e) # mathematical constant e = 2.718281\n", - "\n", - "pi = math.pi\n", - "\n", - "print('\\nRound properly:', round(pi, 5) ) # built-in rounding function\n", - "\n", - "print('Round up:', math.ceil(2.2) )\n", - "print('Round down:', math.floor(3.5) )\n", - "print('Remainder:', math.fmod(7, 2) ) # 7 % 2\n", - "print('Log:', math.log(256, 2)) # logarithmic function (number, base)\n", - "print('Exponential:', math.pow(2, 10.5) ) # x to the power of y, 2**10.5\n", - "print('Square root:', math.sqrt(81) )\n", - "print('Arc sine:', math.asin(.5) ) # in radians\n", - "print('Sine:', math.sin(pi / 2) )\n", - "print('Degrees:', math.degrees(pi) ) # convert x from radians to degrees\n", - "print('Gamma:', math.gamma(3) ) # Gamma function at x\n" - ] - }, - { - "cell_type": "markdown", - "id": "e44e0357", - "metadata": {}, - "source": [ - "# Library re\n", - "- `re` stands for `regular expression`, aka `regex`\n", - "- concept for text pattern matching\n", - "- Python converts the search pattern into a bytestring to search very efficiently in a memory object" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1616d23", - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "\n", - "# reading file\n", - "with open('../data/towels.txt', 'r') as infile:\n", - " story = infile.read()\n", - "\n", - "# create pattern\n", - "pattern = '.*[H|h]itch *[H|h]iker.*'\n", - "regex = re.compile(pattern)\n", - "\n", - "# search for pattern and print each line with the pattern in it\n", - "print( regex.findall(story) )" - ] - }, - { - "cell_type": "markdown", - "id": "93a3e5a6", - "metadata": {}, - "source": [ - "- certain strings have specific meanings:\n", - " - `.*` = any number of any character before/after our pattern except `\\n`, including 0 observations\n", - " - `*` = any number of any character within our pattern, including 0 observations\n", - " - `[H|h]` = one character, eiter an `H` or an `h`\n", - "- when compiling the search pattern, you can include certain flags\n", - " - `re.IGNORECASE` to have case insensitive matching\n", - " - `re.DOTALL` to have the `.` match all characters incl. the line end character `\\n`\n", - " - `re.MULTILINE` to handle multiple lines in a string separately, relevant for:\n", - " - `^` = beginning of a string / line\n", - " - `$` = end of a string/line\n", - "- to combine multiple flags, use the vertical line `|`, i.e. `re.DOTALL | re.MULTILINE`" - ] - }, - { - "cell_type": "markdown", - "id": "5e10a102", - "metadata": {}, - "source": [ - "- some special characters in search pattern:\n", - "\n", - "| Character | Meaning |\n", - "| :---: | :--- |\n", - "| . | any character except new line '\\n' |\n", - "| ^ | at the beginning of a string |\n", - "| $ | at the end of a string |\n", - "| * | multiplier >= 0 |\n", - "| + | multiplier >=1 |\n", - "| ? | multiplier 0-1 |\n", - "| {m} | specific multiplier, i.e. {3} |\n", - "| {m,n} | multiplier range, i.e. {2,4}, also {,4} or {4,} for half-open ranges |\n", - "| [ ] | character set to choose from, i.e. [ACGT], special characters become normal characters, i.e. [ab*] |\n", - "| [a-z] | a single lower case letter |\n", - "| [0-9] | a single digit |\n", - "| \\ | escape character, i.e. \\* is an asterisk and not a multiplier |\n", - "| \\| | logical or when combining |\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "8964ea64", - "metadata": {}, - "source": [ - "- several subfunctions are available for a pattern object\n", - "- below is an overview of the search functions and their result\n", - "- all expect a compiled pattern via `re.compile()` and the string to search in, flags can always be added after the string\n", - "\n", - "\n", - "| Subfunction | Description |\n", - "| :--- | :--- |\n", - "| `pattern.search(string)` | first match object |\n", - "| `pattern.match(string)` | matching object, but tests only the beginning of the string |\n", - "| `pattern.fullmatch(string)` | matching object only if whole string matches, otherwise returns RE |\n", - "| `pattern.findall(string)` | list of match |\n", - "| `pattern.finditer(string)` | iterator over match objects, similar to list of `.findall()` |\n", - "| `pattern.split(string,maxsplit=0)` | splits string based on occurance of the pattern, limited by maxsplit |\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "423d4b35", - "metadata": {}, - "outputs": [], - "source": [ - "sequence = '''ATGGATAAGAAATACTCAATAGGCTTAGATATCGGCACAAATAGCGTCGGATGGGCGGTGATCACTGATG\n", - "AATATAAGGTTCCGTCTAAAAAGTTCAAGGTTCTGGGAAATACAGACCGCCACAGTATCAAAAAAAATCT\n", - "TATAGGGGCTCTTTTATTTGACAGTGGAGAGACAGCGGAAGCGACTCGTCTCAAACGGACAGCTCGTAGA'''\n", - "pattern = re.compile('AT[ACT]GG[ACGT]')# represents AA sequence 'IG' = Isoleucine + Glycine\n", - "\n", - "print('first match', pattern.search(sequence))\n", - "print('match beginning', pattern.match(sequence))\n", - "print('whole string match', pattern.fullmatch(sequence))\n", - "print('list of matches', pattern.findall(sequence))\n", - "print('iterator for matches', pattern.finditer(sequence))\n", - "print('split at matches', pattern.split(sequence))\n" - ] - }, - { - "cell_type": "markdown", - "id": "fad61f15", - "metadata": {}, - "source": [ - "# Pandas and Numpy\n", - "- both libraries are very powerful tools to handle and to process large data\n", - "- Numpy is optimized to store large numerical data and to perform computations on\n", - " - commonly imported as `np`\n", - "- Pandas is optimized for data exploration and modification\n", - " - commonly imported as `pd`\n", - "\n", - "## Numpy\n", - "- central data structure object is `ndarray()`\n", - "- a list can be converted into such an array directly via `np.array()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a66b89b2", - "metadata": {}, - "outputs": [], - "source": [ - "# import sys\n", - "# !{sys.executable} -m pip install numpy\n", - "\n", - "import numpy as np\n", - "arr = np.array([1, 2, 3, 4])\n", - "print(arr)\n", - "print('array data type:', type(arr))\n", - "\n", - "print('dtype of array entries:', arr.dtype)\n", - "\n", - "print('shape of the array:', arr.shape)\n", - "\n", - "# arrays have predefined data types, but those can be converted\n", - "arr.astype(float)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0c8f62b", - "metadata": {}, - "outputs": [], - "source": [ - "# lots of common statistical subfunctions\n", - "print('max ', arr.max())\n", - "print('min ', arr.min())\n", - "print('sum ', arr.sum())\n", - "print('mean', arr.mean())\n", - "print('SD ', arr.std())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8fe5c89b", - "metadata": {}, - "outputs": [], - "source": [ - "# inversely, you can use the functions from the numpy library and hand over the array\n", - "print('max ', np.max(arr))\n", - "print('min ', np.min(arr))\n", - "print('sum ', np.sum(arr))\n", - "print('mean', np.mean(arr))\n", - "print('SD ', np.std(arr))" - ] - }, - { - "cell_type": "markdown", - "id": "fbc69f26", - "metadata": {}, - "source": [ - "- there are many ways to create a numpy array\n", - " - `np.zeros()` creates an array of a given length with `0`\n", - " - `np.ones()` creaes an array of a given length with `1`\n", - " - `np.empty()` creaes an empty array\n", - " - WARNING: this creates only a memory location. Unless overwritten, every `cell` will have an unknown entry\n", - " - arrays can have many more dimensions: `np.array([[1,2],[3,4]])`\n", - " - `np.zeros_like()` creates an array with the same dimension as the array provided, filled with `0`\n", - "- index methods work similar to lists in Python" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ba1f7f2", - "metadata": {}, - "outputs": [], - "source": [ - "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])\n", - "print('array ', arr)\n", - "print('inversion ', arr[::-1])\n", - "print('slice ', arr[3:6])\n", - "print('element selection', arr[::3])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "355fd3aa", - "metadata": {}, - "outputs": [], - "source": [ - "# conditional selection\n", - "print('conditional', arr[arr > 5])\n", - "print('index list ', np.where(arr > 5))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1a90faf", - "metadata": {}, - "outputs": [], - "source": [ - "# numpy arrays are mutable (entries can be changed)\n", - "print(arr)\n", - "arr[5] *= 100\n", - "print(arr)" - ] - }, - { - "cell_type": "markdown", - "id": "f337af74", - "metadata": {}, - "source": [ - "## Numpy linked arrays\n", - "- referencing to an array creates a link, not a copy of the array\n", - "- changing the value in one will effect the other one\n", - "- this is also true for all index based slices\n", - "- to get a copy, use numpy's copy function `np.copy()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dcd5b983", - "metadata": {}, - "outputs": [], - "source": [ - "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])\n", - "print('arr ', arr)\n", - "arr2 = arr\n", - "arr2[1] = 0\n", - "print('arr2', arr2)\n", - "print('arr ', arr)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9711ba64", - "metadata": {}, - "outputs": [], - "source": [ - "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])\n", - "print('arr ', arr)\n", - "arr2 = arr[::2]\n", - "arr2[:] = 50\n", - "print('arr2', arr2)\n", - "print('arr ', arr)\n", - "\n", - "# vs a copy of the array\n", - "\n", - "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])\n", - "print('arr ', arr)\n", - "arr2 = np.copy(arr[::2])\n", - "arr2[:] = 50\n", - "print('arr2 (copy)', arr2)\n", - "print('arr ', arr)" - ] - }, - { - "cell_type": "markdown", - "id": "771324fa", - "metadata": {}, - "source": [ - "- Numpy arrays can be n-dimensional\n", - "- you can either reshape an existing one `.reshape(, , ...)`\n", - " - obviously, the total number of values must match\n", - "- for predefined zeros or ones, you can create an array with specified dimensions parameter `shape = (, , ...)`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8d665ea", - "metadata": {}, - "outputs": [], - "source": [ - "arr = np.zeros(shape=(3, 5, 2))\n", - "print(arr)\n", - "print('---')\n", - "arr_new = arr.reshape(5, 6) # creates a copy\n", - "print(arr_new)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20f997c3", - "metadata": {}, - "outputs": [], - "source": [ - "# index based access to multidimensional arrays:\n", - "arr = np.zeros(shape=(2, 3, 2))\n", - "\n", - "arr[1][2] = 10 # Python list method\n", - "print(arr, '\\n')\n", - "\n", - "arr[1, 2] *= 2 # numpy method\n", - "print(arr)" - ] - }, - { - "cell_type": "markdown", - "id": "93bec0ef", - "metadata": {}, - "source": [ - "## Why to use numpy?\n", - "- numpy (and scipy) are fast, really fast\n", - "- for demonstration purposes, we will create 10K random numbers and add them together. We will repeat the step for the addition several times and test the performance with a (Jupyter) built-in function `%timeit`\n", - "- we will compare numpy with a for loop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e0ca390", - "metadata": {}, - "outputs": [], - "source": [ - "# Make array of 10,000 random numbers\n", - "x = np.random.random(10000)\n", - "\n", - "# Sum with Python's for loop\n", - "def python_sum(x):\n", - " x_sum = 0.0\n", - " for y in x:\n", - " x_sum += y\n", - " return(x_sum)\n", - "\n", - "# Test speed\n", - "print(\"For loop:\")\n", - "%timeit python_sum(x)\n", - "\n", - "print(\"\\nPython sum function:\")\n", - "%timeit sum(x)\n", - "\n", - "import math\n", - "print(\"\\nMath sum function:\")\n", - "%timeit math.fsum(x)\n", - "\n", - "print(\"\\nNumpy:\")\n", - "%timeit np.sum(x)\n" - ] - }, - { - "cell_type": "markdown", - "id": "5ab3114b", - "metadata": {}, - "source": [ - "## Pandas\n", - "- central data structure object is `DataFrame()`\n", - "- a table with rows and columns, similar to an Excel sheet\n", - "- columns can be of different data types, i.e. one column can be integers, another one can be strings, etc.\n", - "- rows and columns can be indexed via labels or via integer based indexing\n", - "- many functions to explore and modify the data, i.e. `df.head()`, `df.describe()`, `df.drop()`, etc." - ] - }, - { - "cell_type": "markdown", - "id": "4abae2f5", - "metadata": {}, - "source": [ - "- we will use example data from the public Dryad repository (sleep and reaction time data) https://doi.org/10.5061/dryad.r620r\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fb5e959", - "metadata": {}, - "outputs": [], - "source": [ - "# showing first 10 lines of the file, missing data indicated by *\n", - "print('\\n'.join(open('data/gfmt_sleep.csv').read().split('\\n')[:10]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4fde985", - "metadata": {}, - "outputs": [], - "source": [ - "# import sys\n", - "# !{sys.executable} -m pip install pandas\n", - "\n", - "import pandas as pd\n", - "\n", - "df = pd.read_csv('../data/gfmt_sleep.csv', na_values='*')\n", - "type(df)" - ] - }, - { - "cell_type": "markdown", - "id": "29c5a38a", - "metadata": {}, - "source": [ - "- using jupyter notebooks, the data presentation of panda dataframes will always look pretty\n", - "- the subfunction `.head()` allows you to inspect the first few rows" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62bb7124", - "metadata": {}, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "604b70a9", - "metadata": {}, - "source": [ - "- pandas dataframes work partially like a dictionary\n", - "- indexing is done by column names first, followed by a row index (number), starting pythonic with `0`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0bc4558", - "metadata": {}, - "outputs": [], - "source": [ - "df['age']\n", - "print()\n", - "df['age'][0]" - ] - }, - { - "cell_type": "markdown", - "id": "2567f5ec", - "metadata": {}, - "source": [ - "- a better way to work with pandas dataframes is, to use the `.loc[]` attribute, starting with a row index and a comma-separated column name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "311a8a2f", - "metadata": {}, - "outputs": [], - "source": [ - "df.loc[1,'age']" - ] - }, - { - "cell_type": "markdown", - "id": "97a83e60", - "metadata": {}, - "source": [ - "- usually, for data exploration, you would not address a specific entry, but something conditional" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ea69a94", - "metadata": {}, - "outputs": [], - "source": [ - "df.loc[df['Ppt No.'] == 42, 'overall percent correct']" - ] - }, - { - "cell_type": "markdown", - "id": "cb05b776", - "metadata": {}, - "source": [ - "- if you do not want to specify the column (or row), use a colon `:` instead" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d946ecbd", - "metadata": {}, - "outputs": [], - "source": [ - "df.loc[df['Ppt No.'] == 42, :]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8f4fdd2", - "metadata": {}, - "outputs": [], - "source": [ - "df.loc[: , 'overall percent correct']" - ] - }, - { - "cell_type": "markdown", - "id": "6e3b0446", - "metadata": {}, - "source": [ - "- to combine conditional operations, use the ambersand `&`\n", - "- you can combine as many operations as you like" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3524dc9a", - "metadata": {}, - "outputs": [], - "source": [ - "# getting all females under the age of 21\n", - "df.loc[(df['age'] < 21) & (df['gender'] == 'f'), :]" - ] - }, - { - "cell_type": "markdown", - "id": "ffe8bf11", - "metadata": {}, - "source": [ - "- the ambersand `&` creates a boolean array (True/False) that is directly applied to the df in the example before\n", - "- to store that boolean array, just assign it to a variable\n", - "- a 1D array is called a series" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "698ff1c0", - "metadata": {}, - "outputs": [], - "source": [ - "inds = (df[\"age\"] < 30) & (df[\"gender\"] == \"f\") & (df[\"overall percent correct\"] > 85)\n", - "\n", - "print(inds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f1ee724", - "metadata": {}, - "outputs": [], - "source": [ - "# re-applying our True/False array\n", - "df.loc[inds, :]#we want to see all columns" - ] - }, - { - "cell_type": "markdown", - "id": "152b39bd", - "metadata": {}, - "source": [ - "- the data selection and handling happens more conveniently comparred to nested lists, through which you have to loop through (i.e. `for` loops)\n", - "- internally, the process is also much faster, which comes into play for very large datasets and repetetive tasks\n", - "\n", - "
\n", - "\n", - "- in this study, a person suffers from insomnia, if they have a SCI of <=16\n", - "- we want to select all people and create a new column with that information\n", - "- doing any calculations within an array instead of going through each element is called vectorization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be180923", - "metadata": {}, - "outputs": [], - "source": [ - "df['insomnia'] = df['sci'] <= 16\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f28c7617", - "metadata": {}, - "source": [ - "## Applying functions to Pandas dataframes\n", - "- you can apply functions to the pandas dataframe or to a subset of it\n", - "- usually, the more complex functions should come from numpy because there were optimized to work very fast and memory efficient on a large scale\n", - "- in pandas, the operator `~` works like the keyword `not` in Python" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c816c3f5", - "metadata": {}, - "outputs": [], - "source": [ - "#calculating the mean of some columns\n", - "mean_correct_pos = np.mean(df.loc[df['insomnia'], 'overall percent correct'])\n", - "mean_correct_neg = np.mean(df.loc[~df['insomnia'], 'overall percent correct'])\n", - "print(F\"Positive cases : {mean_correct_pos:.2f}\")\n", - "print(F\"Negative controls: {mean_correct_neg:.2f}\")" - ] - }, - { - "cell_type": "markdown", - "id": "2ff82fa8", - "metadata": {}, - "source": [ - "- there are a lot of very convenient functions available to make your life easier\n", - "- to get a comprehensive summary of your whole table, you can use the subfunction `.describe()`\n", - "- this produces a new dataframe with the summary statistics\n", - "- in here, there are no row numbers anymore, but named rows\n", - "- to extract a specific row, you have to use the particular name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffdce51a", - "metadata": {}, - "outputs": [], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "567c6dba", - "metadata": {}, - "outputs": [], - "source": [ - "summary_stats = df.describe()\n", - "summary_stats.loc['50%', :] # this is the median for each column\n", - "# you can also write: df.describe().loc['50%',:]" - ] - }, - { - "cell_type": "markdown", - "id": "366db8b4", - "metadata": {}, - "source": [ - "- it makes sense to write dataframes to file for later use if you changed something\n", - "- to write a csv file, use the dataframe subfunction `.to_csv()`\n", - "- we will not require an index (it would be an ascending number in the first column)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ba49c5b", - "metadata": {}, - "outputs": [], - "source": [ - "df.to_csv('data/gfmt_sleep_with_insomnia.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "c5d345a6", - "metadata": {}, - "source": [ - "## Tidy data\n", - "- usually human readable data tables differ from computationally convenient tables\n", - "- typically tables are rarely used to represent data, in particular large datasets\n", - "- you can always reformat tables, but it is not fun and consumes a lot of time and code\n", - "- better having a 'standardized' concept:\n", - " - a column for each parameter \n", - " - a row for each data point\n", - " - a table for each type of observation\n", - "- if you want to read more about it: http://dx.doi.org/10.18637/jss.v059.i10" - ] - }, - { - "cell_type": "markdown", - "id": "59db8995", - "metadata": {}, - "source": [ - "- pandas offers the option to aggregate data with the subfunction `.groupby()`\n", - "- this creates a new dataframe with a variety of standard function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a7ed13f", - "metadata": {}, - "outputs": [], - "source": [ - "grouped = df.groupby('insomnia')\n", - "print(grouped.size())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0c99985", - "metadata": {}, - "outputs": [], - "source": [ - "grouped.median(numeric_only=True)" - ] - }, - { - "cell_type": "markdown", - "id": "cc0455f8", - "metadata": {}, - "source": [ - "- the grouping column has been converted into a named row index\n", - "- should you require to revert this, use the subfunction `reset_index()` and the parameter will be reported as a separate column as before" - ] - }, - { - "cell_type": "markdown", - "id": "8fb8111b", - "metadata": {}, - "source": [ - "## Tidying up a dataset\n", - "- as outlined before, humanly readable data is rarely easy to handle, so called `wide format`\n", - "- the following code will provide an example on how to:\n", - " - read in a raw data file\n", - " - re-assemble a table with one parameter per column. We will use the function `melt()` for that. This is called `long format`\n", - " - calculate general statistics\n", - "- the data comes from Reeves et al. 2012 in Developmental Cell https://doi.org/10.1016/j.devcel.2011.12.007" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1404382", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('../data/reeves_gradient_width_various_methods.csv', comment='#', header=[0,1])\n", - "\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "981f129a", - "metadata": {}, - "outputs": [], - "source": [ - "# The first row of column headings contains the genotypes.\n", - "# The second row contains the staining/visualization method.\n", - "# The data give the computed gradient width as parametrized by\n", - "# sigma (see the original paper).\n", - "\n", - "df.columns.names = ['genotype', 'method']\n", - "df = pd.melt(df, value_name='gradient width')\n", - "\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19a814b9", - "metadata": {}, - "outputs": [], - "source": [ - "# removing n/a values\n", - "df = df.dropna()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "775fd807", - "metadata": {}, - "outputs": [], - "source": [ - "df.groupby(['genotype', 'method']).describe()" - ] - }, { "cell_type": "markdown", "id": "3766d5b1", "metadata": {}, "source": [ - "# Exercises\n", + "# Lesson 07\n", "\n", - "> In this set of exercises, we will combine multiple libraries that you learned about today as well as knowledge you gained before" + "## Exercises\n", + "\n", + "In this set of exercises, we will combine multiple libraries that were covered in this lessons, as well as knowledge gained from previous lessons." ] }, { @@ -1056,7 +19,7 @@ "source": [ "- create a random RNA sequence with a length of 1000 nucleotides using the 4 standard nucleotides\n", "- use a list comprehension\n", - "- return the sequence as a string" + "- return the sequence as a string\n" ] }, { @@ -1198,7 +161,7 @@ "id": "0be4abb6", "metadata": {}, "source": [ - "## Numpy\n", + "### Numpy\n", "- let's focus on numpy arrays for a bit" ] }, @@ -1292,12 +255,12 @@ "id": "7bf0de56", "metadata": {}, "source": [ - "## Pandas\n", + "### Pandas\n", "\n", - "- load the provided csv file ```../data/iris.data```\n", + "- load the provided csv file `../data/iris.data`\n", "- Display the first 5 rows\n", "- Show column names\n", - "- Get basic summary statistics" + "- Get basic summary statistics\n" ] }, { @@ -1375,8 +338,9 @@ "id": "6f336b44", "metadata": {}, "source": [ - "## Bonus task\n", - "- try to tidy up the iris dataset" + "### Bonus task\n", + "\n", + "- try to tidy up the iris dataset\n" ] }, { diff --git a/solutions/solutions_08.ipynb b/solutions/solutions_08.ipynb index 0164c7b..9c2cf25 100644 --- a/solutions/solutions_08.ipynb +++ b/solutions/solutions_08.ipynb @@ -1,826 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "a60ff099", - "metadata": {}, - "source": [ - "# Lesson 8\n", - "\n", - "## In this lesson, we will learn about the following topics:\n", - "- Biopython\n", - "- Visualization with:\n", - " - Matplotlib\n", - " - plotnine\n", - "\n", - "## Recap of previous lesson\n", - "- external libraries:\n", - " - random\n", - " - gzip\n", - " - argparse\n", - " - math\n", - " - re\n", - " - numpy\n", - " - pandas\n", - "- tidy data" - ] - }, - { - "cell_type": "markdown", - "id": "c11c55c5", - "metadata": {}, - "source": [ - "## Biopython\n", - "- Biopython is a collection of tools for computational biology and bioinformatics\n", - "- Usually, you will **not** load the whole library but specific functions based on your needs\n", - "\n", - "## The sequence object\n", - "- DNA/RNA/Protein sequences are strings\n", - "- `seq` object acts mostly like a string, but is immutable by default\n", - "- main differences:\n", - " - different set of methods, i.e. `seq.translate()` or `seq.reverse_complement()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29780122", - "metadata": {}, - "outputs": [], - "source": [ - "# import sys\n", - "# !{sys.executable} -m pip install biopython" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ff2fc38", - "metadata": {}, - "outputs": [], - "source": [ - "from Bio.Seq import Seq\n", - "\n", - "my_seq = Seq(\"AGTACACTGGTA\")\n", - "print(my_seq)\n", - "print(my_seq.translate())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d354504d", - "metadata": {}, - "outputs": [], - "source": [ - "print(my_seq.reverse_complement())" - ] - }, - { - "cell_type": "markdown", - "id": "62bec9aa", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "source": [ - "- `seq` objects can be converted into a string with the `str()` function\n", - "- can be printed to screen directly\n", - "- f-strings and placeholders are compatible" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49d36f2d", - "metadata": {}, - "outputs": [], - "source": [ - "protein_seq = Seq(\"EVRNAK\")\n", - "print(protein_seq)" - ] - }, - { - "cell_type": "markdown", - "id": "48d793f6", - "metadata": {}, - "source": [ - "- `.transcribe()` replaces \"T\" with \"U\"\n", - " - it does NOT create a reverse complement sequence\n", - "- `.back_transcribe()` reverts the above outcome\n", - "- `.translate()` translates to protein seq and works with RNA and DNA\n", - " - default code table is standard genetic code, but can be defined otherwise, i. e. \"Bacterial\" for bacterial DNA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97fdf8a2", - "metadata": {}, - "outputs": [], - "source": [ - "coding_dna = Seq(\"ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG\")\n", - "print(coding_dna.translate())\n", - "print(coding_dna.translate(table=\"Vertebrate Mitochondrial\"))" - ] - }, - { - "cell_type": "markdown", - "id": "a104aa86", - "metadata": {}, - "source": [ - "- You can use the biopython functions directly on strings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8726a88c", - "metadata": {}, - "outputs": [], - "source": [ - "from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate\n", - "my_string = \"GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG\"\n", - "print('input sequence ', my_string)\n", - "print('reverse complement ', reverse_complement(my_string))\n", - "print('transcribed seq ', transcribe(my_string))\n", - "print('back-transcribed seq', back_transcribe(transcribe(my_string)))\n", - "print('translated seq ', translate(my_string))" - ] - }, - { - "cell_type": "markdown", - "id": "ef60e50d", - "metadata": {}, - "source": [ - "# Sequence I/O (input / output)\n", - "- module to read in common biological data formats (except gff files)\n", - "- main function `.parse()`\n", - " - expects filename\n", - " - expects format type (does **not** do guessing)\n", - " - creates an iterator over SeqRecord objects\n", - "- full list of file types: http://biopython.org/wiki/SeqIO\n", - "- compatible with list comprehensions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e53f83f7", - "metadata": {}, - "outputs": [], - "source": [ - "from Bio import SeqIO\n", - "\n", - "identifiers = [seq_record.id for seq_record in SeqIO.parse(\"../data/ls_orchid.gbk\", \"genbank\")]\n", - "print(identifiers[:5])" - ] - }, - { - "cell_type": "markdown", - "id": "82a52510", - "metadata": {}, - "source": [ - "- `.parse()` can use a file handle (file data stream) instead of a file name\n", - "- enables you i.e. to read compressed files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04b8e47a", - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"../data/ls_orchid.gbk\") as handle:\n", - " for seq_record in SeqIO.parse(handle, \"genbank\"):\n", - " print(seq_record.id)\n", - " break\n", - "\n", - "import gzip\n", - "with gzip.open(\"../data/ls_orchid.gbk.gz\",'rt') as handle:\n", - " for seq_record in SeqIO.parse(handle, \"genbank\"):\n", - " print(seq_record.id)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "id": "f14fe9e1", - "metadata": {}, - "source": [ - "# SeqRecord object\n", - "- object data type to contain higher level annotation for sequences \n", - "- typically used for complex sequence data, i.e. from genbank\n", - "- basic data type by SeqIO\n", - "- multiple attributes:\n", - " - `.seq` = sequence\n", - " - `.id` = primary ID, i.e. accession number\n", - " - `.name` = common name\n", - " - `.description` = human readable description\n", - " - `.features` = A list of SeqFeature objects with more structured information about the features on a sequence (e.g. position of genes on a genome, or domains on a protein sequence).\n", - " \n", - "- to access a file with **only one** record, you can use `SeqIO.read(, )` to get the content\n", - "- to access a file with **multiple** records, use `SeqIO.parse(, )` to get the content\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "786da2e3", - "metadata": {}, - "outputs": [], - "source": [ - "from Bio import SeqIO\n", - "for record in SeqIO.parse(\"../data/NC_005816.gb\", \"genbank\"):\n", - " print(record.id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7fc50149", - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"../data/NC_005816.gb\", 'r') as infile:\n", - " for e, line in enumerate(infile):\n", - " print(line, end='')\n", - " if e > 10: break" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c7354d7", - "metadata": {}, - "outputs": [], - "source": [ - "for record in SeqIO.parse(\"../data/NC_005816.gb\", \"genbank\"):\n", - " entry = record\n", - " print('Sequence ', record.seq[:50])\n", - " print('Record ID ', record.id)\n", - " print('Description ', record.description)\n", - " print('No. of feat.', len(record.features))\n", - " break" - ] - }, - { - "cell_type": "markdown", - "id": "0eb9a874", - "metadata": {}, - "source": [ - "## SeqFeature object\n", - "- describes more abstract information about a sequence\n", - "- list of common attributes:\n", - " - `.type` = textual description, i.e. \"gene\" or \"CDS\"\n", - " - `.location` (with sub-attributes like `.strand`)\n", - " - `.qualifiers` = dict with specific info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c07fd19", - "metadata": {}, - "outputs": [], - "source": [ - "for e, feat in enumerate(entry.features):\n", - " print(e, feat.type)\n", - " if feat.type != 'source':\n", - " print('feat. location ', feat.location)#circular genome and spanning position 0\n", - " print('feat. information', feat.qualifiers)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "id": "0fc324f7", - "metadata": {}, - "source": [ - "# Accessing NCBI\n", - "- WARNING: NCBI can and will block your institute, if you misuse this, even unintentionally, but they will try to reach you first.\n", - "- NCBI guidelines: https://www.ncbi.nlm.nih.gov/books/NBK25497/\n", - "- In brief:\n", - " - For more than 100 consecutive requests in a row, do that at a weekend or outside of US peak times\n", - " - No more than 3 queries per second\n", - " - **IMPORTANT**: always provide a valid email address for contacting you! \n", - " - Entrez allows you to search and download large amounts of data. Save those files rather than downloading them over and over again!\n", - " - Use alternatives to download large datasets, i.e. ftp server for all bacterial genomes etc.\n" - ] - }, - { - "cell_type": "markdown", - "id": "a65f491c", - "metadata": {}, - "source": [ - "- Allows programmatic database search similar to website\n", - "- Multiple databases are supported, i.e. nucleotide, pubmed\n", - "- Will return identifiers for downloading / accessing data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d52bf4d", - "metadata": {}, - "outputs": [], - "source": [ - "from Bio import SeqIO\n", - "from Bio import Entrez\n", - "Entrez.email = \"finstermeier@mpusp.mpg.de\"\n", - "handle = Entrez.esearch(db=\"pubmed\", term=\"biopython\")\n", - "record = Entrez.read(handle)\n", - "print(record[\"IdList\"][:5])" - ] - }, - { - "cell_type": "markdown", - "id": "e428202c", - "metadata": {}, - "source": [ - "- Available databases: https://www.ncbi.nlm.nih.gov/books/NBK25497/table/chapter2.T._entrez_unique_identifiers_ui/?report=objectonly\n", - "- Keywords for search term can be found here: https://www.ncbi.nlm.nih.gov/books/NBK49540/\n", - "- Overview of available databases and parameters:\n", - " - https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d94519b6", - "metadata": {}, - "outputs": [], - "source": [ - "handle = Entrez.esearch(db=\"nucleotide\", term=\"Cypripedioideae[Orgn] AND matK[Gene]\", idtype=\"acc\")\n", - "record = Entrez.read(handle)\n", - "print(record[\"IdList\"][:5])" - ] - }, - { - "cell_type": "markdown", - "id": "4fd85ecd", - "metadata": {}, - "source": [ - "# Downloading from NCBI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d794ccd", - "metadata": {}, - "outputs": [], - "source": [ - "with Entrez.efetch(db=\"nucleotide\", rettype=\"gb\", retmode=\"text\", id=\"6273291\") as handle:\n", - " seq_record = SeqIO.read(handle, \"gb\") #using \"gb\" as an alias for \"genbank\"\n", - " print(f\"{seq_record.id} with {len(seq_record.features)} features\")\n", - " \n", - "\n", - "with Entrez.efetch(db=\"nucleotide\",rettype=\"gb\",retmode=\"text\",id=\"6273291, 6273290, 6273289\") as handle:\n", - " for seq_record in SeqIO.parse(handle, \"gb\"):\n", - " print(f\"{seq_record.id}: {seq_record.description[:50]}...\")" - ] - }, - { - "cell_type": "markdown", - "id": "0642bc14", - "metadata": {}, - "source": [ - "# Plotting in Python\n", - "\n", - "- There are multiple plotting libraries in Python, but we will focus on two of them:\n", - " - Matplotlib (very briefly)\n", - " - plotnine\n", - "\n", - "# Matplotlib\n", - "- Matplotlib is a widely used 2D plotting library in Python, which provides a lot of flexibility and customization options for creating a wide variety of plots and visualizations.\n", - "- It is often used for creating static, publication-quality plots, and can be used in combination with other libraries such as NumPy and Pandas for data manipulation and analysis.\n", - "- It is a very low level / basic plotting library and numberous other libraries are built on top of it, i.e. [seaborn](https://seaborn.pydata.org/), [plotnine](https://plotnine.org/), etc.\n", - "\n", - "- great variety of plots predefined in subfunctions\n", - "- has a modular structure for the canvas\n", - "- few commands quickly produce good plots\n", - "- showcase with code pieces: https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.show.html#matplotlib.pyplot.show" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5dec55c1", - "metadata": {}, - "outputs": [], - "source": [ - "# import sys\n", - "# !{sys.executable} -m pip install matplotlib" - ] - }, - { - "cell_type": "markdown", - "id": "9fb8083f", - "metadata": {}, - "source": [ - "## Line plots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "547bfbd0", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "plt.plot([1, 3, 2, 4, 3, 2, 4, 3])\n", - "plt.show()\n", - "plt.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "84468733", - "metadata": {}, - "outputs": [], - "source": [ - "xdata = range(20)\n", - "ydata = [0.0, 0.31, 0.59, 0.81, 0.95, 1.0, 0.95, 0.81, 0.59, 0.31, 0.0, -0.31, -0.59, -0.81, -0.95, -1.0, -0.95, -0.81, -0.59, -0.31]\n", - "\n", - "# adjusting line style, color, and adding markers with a specific size\n", - "plt.plot(xdata, ydata, linewidth=2, linestyle=':', color='green', marker='o', markersize=12)\n", - "plt.show()\n", - "plt.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9c7e466", - "metadata": {}, - "outputs": [], - "source": [ - "# adding a second plot on top\n", - "xdata2 = range(0, 21, 4)\n", - "ydata2 = [3.5, 2.5, 2, 2, 2.5, 3.5]\n", - "\n", - "plt.plot(xdata, ydata, linewidth=2, linestyle=':', color='green', marker='o', markersize=12)\n", - "plt.plot(xdata2, ydata2, lw=1, color=(1.0, 0, 0))\n", - "plt.show()\n", - "plt.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62fdc3cf", - "metadata": {}, - "outputs": [], - "source": [ - "# modifying y axis\n", - "fig, ax1 = plt.subplots(figsize=(3, 1.69), dpi=200)\n", - "plt.plot(xdata, ydata, linewidth=2, linestyle=':', color='green', marker='o', markersize=4)\n", - "plt.plot(xdata2, ydata2, lw=1, color=(1.0, 0, 0))\n", - "ax1.set_ylim(-1, 5)\n", - "plt.show()\n", - "plt.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b4515aa", - "metadata": {}, - "outputs": [], - "source": [ - "# adding axis lables, minimizing white space outside of the plot\n", - "# adding grid lines and adjusting font size for x axis\n", - "fig, ax1 = plt.subplots(figsize=(3,1.69), dpi=200)\n", - "plt.plot(xdata, ydata, linewidth=1, linestyle=':', color='green', marker='o', markersize=4)\n", - "plt.plot(xdata2, ydata2, lw=.5, color=(1.0, 0, 0))\n", - "plt.suptitle('plotting something cool', size='small')\n", - "ax1.set_xlabel('x values')\n", - "ax1.set_ylabel('y axis')\n", - "plt.grid(True)\n", - "plt.xticks(fontsize=5)\n", - "plt.tight_layout()\n", - "plt.show()\n", - "plt.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8cc81f3", - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "\n", - "xdata = [random.random() for i in range(100)]\n", - "ydata = [random.random() for i in range(100)]\n", - "fig, ax1 = plt.subplots(figsize=(6, 3.38), dpi=100)\n", - "\n", - "plt.scatter(xdata, ydata)\n", - "\n", - "ax1.set_xlabel('x values')\n", - "ax1.set_ylabel('y axis')\n", - "plt.grid(True)\n", - "plt.tight_layout()\n", - "plt.show()\n", - "plt.close()" - ] - }, - { - "cell_type": "markdown", - "id": "fc62021f", - "metadata": {}, - "source": [ - "# plotnine\n", - "- plotnine implements the [Grammar of Graphics](https://towardsdatascience.com/a-comprehensive-guide-to-the-grammar-of-graphics-for-effective-visualization-of-multi-dimensional-1f92b4ed4149/):\n", - "\n", - " - **Data**: a DataFrame\n", - " - **Mappings**: `aes(x=..., y=..., color=..., ...)`\n", - " - **Geometries**: `geom_point()`, `geom_line()`, ...\n", - " - **Scales**: how data values map to aesthetics\n", - " - **Facets**: small multiples\n", - " - **Themes**: styling\n", - "\n", - "- Pattern:\n", - "\n", - "```python\n", - "(ggplot(df, aes('x', 'y'))\n", - " + geom_point()\n", - " + labs(title='...')\n", - " + theme_minimal()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93355b2a", - "metadata": {}, - "outputs": [], - "source": [ - "# import sys\n", - "# !{sys.executable} -m pip install plotnine" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a28e772", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from plotnine import (\n", - " ggplot, aes,\n", - " geom_point, geom_col, geom_histogram, geom_smooth,\n", - " facet_wrap, facet_grid,\n", - " labs, theme_minimal, theme, element_text, element_blank,\n", - " scale_color_brewer, scale_fill_brewer,\n", - " coord_flip\n", - ")\n", - "\n", - "# Reproducible toy dataset\n", - "rng = np.random.default_rng(42)\n", - "\n", - "n = 240\n", - "df = pd.DataFrame({\n", - " \"x\": rng.normal(0, 1, n),\n", - " \"group\": rng.choice([\"A\", \"B\", \"C\"], size=n, replace=True),\n", - " \"category\": rng.choice([\"cat1\", \"cat2\"], size=n, replace=True),\n", - "})\n", - "\n", - "# Make y depend on x and group to show structure\n", - "group_effect = df[\"group\"].map({\"A\": 0.0, \"B\": 1.0, \"C\": -1.0}).astype(float).to_numpy()\n", - "df[\"y\"] = 2.0 * df[\"x\"].to_numpy() + group_effect + rng.normal(0, 0.7, n)\n", - "\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "189d4f50", - "metadata": {}, - "source": [ - "## Scatter plot (mapping aesthetics)\n", - "\n", - "- `aes()` defines **mappings** from columns to plot attributes\n", - "- `geom_point()` draws points" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2d9922b", - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " ggplot(df, aes(\"x\", \"y\", color=\"group\"))\n", - " + geom_point(alpha=0.7)\n", - " + scale_color_brewer(type=\"qual\", palette=\"Set2\")\n", - " + labs(title=\"Scatter: y vs x, colored by group\", x=\"x\", y=\"y\")\n", - " + theme_minimal()\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "1561cb11", - "metadata": {}, - "source": [ - "## Add a trend line\n", - "\n", - "`geom_smooth()` fits a model per group (by default) when `color`/`group` is mapped." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44322e7d", - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " ggplot(df, aes(\"x\", \"y\", color=\"group\"))\n", - " + geom_point(alpha=0.35)\n", - " + geom_smooth(method=\"lm\", se=False)\n", - " + scale_color_brewer(type=\"qual\", palette=\"Set2\")\n", - " + labs(title=\"Scatter + linear fit per group\", x=\"x\", y=\"y\")\n", - " + theme_minimal()\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ac21bb38", - "metadata": {}, - "source": [ - "## Summarize data, then plot\n", - "\n", - "Process data with pandas, then plot with plotnine.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0644b2a3", - "metadata": {}, - "outputs": [], - "source": [ - "# Summarize mean y by group\n", - "df_mean = (\n", - " df.groupby(\"group\", as_index=False)\n", - " .agg(mean_y=(\"y\", \"mean\"),\n", - " sd_y=(\"y\", \"std\"),\n", - " n=(\"y\", \"size\"))\n", - ")\n", - "df_mean" - ] - }, - { - "cell_type": "markdown", - "id": "7487d364", - "metadata": {}, - "source": [ - "## Bar chart from a summarized table\n", - "\n", - "`geom_col()` uses the y-values you provide (unlike `geom_bar()` which counts)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a65a88ae", - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " ggplot(df_mean, aes(\"group\", \"mean_y\", fill=\"group\"))\n", - " + geom_col(width=0.7)\n", - " + scale_fill_brewer(type=\"qual\", palette=\"Set2\")\n", - " + labs(title=\"Mean y by group\", x=\"group\", y=\"mean(y)\")\n", - " + theme_minimal()\n", - " + theme(legend_position=\"none\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "0c31d460", - "metadata": {}, - "source": [ - "## Histograms\n", - "\n", - "- A histogram shows the frequency distribution of one variable\n", - "- Choose bins (or binwidth) to control the number of bars\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7b9d149", - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " ggplot(df, aes(\"y\", fill=\"group\"))\n", - " + geom_histogram(bins=30, alpha=0.6, position=\"identity\")\n", - " + scale_fill_brewer(type=\"qual\", palette=\"Set2\")\n", - " + labs(title=\"Histogram of y (overlaid by group)\", x=\"y\", y=\"count\")\n", - " + theme_minimal()\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f0f6be2b", - "metadata": {}, - "source": [ - "## Faceting (small multiples)\n", - "\n", - "- separate one figure into multiple sub-figures using one or more variables\n", - "- `facet_wrap('~group')` for one variable\n", - "- `facet_grid('row ~ col')` for two variables\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9759f6bf", - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " ggplot(df, aes(\"x\", \"y\"))\n", - " + geom_point(alpha=0.6)\n", - " + geom_smooth(method=\"lm\", se=False)\n", - " + facet_wrap(\"~group\")\n", - " + labs(title=\"Facet by group: scatter + fit\", x=\"x\", y=\"y\")\n", - " + theme_minimal()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f21bc234", - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " ggplot(df, aes(\"x\", \"y\", color=\"group\"))\n", - " + geom_point(alpha=0.4)\n", - " + facet_grid(\"category ~ group\")\n", - " + scale_color_brewer(type=\"qual\", palette=\"Set2\")\n", - " + labs(title=\"Facet grid: category (rows) x group (cols)\", x=\"x\", y=\"y\")\n", - " + theme_minimal()\n", - " + theme(legend_position=\"none\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "14b0014a", - "metadata": {}, - "source": [ - "## Themes and labels\n", - "\n", - "Most styling is done via `theme_*()` and `theme()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdbb2374", - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " ggplot(df, aes(\"x\", \"y\", color=\"group\"))\n", - " + geom_point(alpha=0.7)\n", - " + scale_color_brewer(type=\"qual\", palette=\"Set2\")\n", - " + labs(\n", - " title=\"Styled plot\",\n", - " subtitle=\"Minimal theme + larger title text\",\n", - " caption=\"Synthetic data\"\n", - " )\n", - " + theme_minimal()\n", - " + theme(\n", - " plot_title=element_text(size=16, weight=\"bold\"),\n", - " plot_subtitle=element_text(size=11),\n", - " axis_title=element_text(size=11),\n", - " legend_title=element_blank()\n", - " )\n", - ")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -844,7 +23,9 @@ "id": "ecee966f", "metadata": {}, "source": [ - "# Exercises" + "# Lesson 08\n", + "\n", + "## Exercises\n" ] }, { @@ -852,9 +33,10 @@ "id": "86ebf512", "metadata": {}, "source": [ - "## Biopython\n", + "### Biopython\n", + "\n", "- load the file `data/uniprot_sequences.fasta` and create a list of all sequence IDs and sequences as tuples using Biopython\n", - "- sort the list alphabetically by ID and print the first 5 IDs" + "- sort the list alphabetically by ID and print the first 5 IDs\n" ] }, { @@ -878,9 +60,9 @@ "id": "efe680b0", "metadata": {}, "source": [ - "## Matplotlib\n", + "### Matplotlib\n", "\n", - "- Using the results from the previous exercise, create a bar plot of the counts of homopolymers with a length of 3 across all sequences using Matplotlib." + "- Using the results from the previous exercise, create a bar plot of the counts of homopolymers with a length of 3 across all sequences using Matplotlib.\n" ] }, { @@ -892,11 +74,11 @@ "source": [ "import matplotlib.pyplot as plt\n", "\n", - "counts = {nt*3:0 for nt in \"ACGT\"}\n", + "counts = {nt * 3: 0 for nt in \"ACGT\"}\n", "for record in SeqIO.parse(\"../data/NC_005816.gb\", \"genbank\"):\n", " for i in range(0, len(record.seq)):\n", - " if record.seq[i:i+3] in counts:\n", - " counts[record.seq[i:i+3]] += 1\n", + " if record.seq[i : i + 3] in counts:\n", + " counts[record.seq[i : i + 3]] += 1\n", "\n", "plt.bar(counts.keys(), counts.values())\n", "plt.xlabel(\"homopolymer\")\n", @@ -909,9 +91,8 @@ "id": "3a73b6ca", "metadata": {}, "source": [ - "\n", "- use the `random` library to create a list of 100 random numbers from a Gauss distribution (mu=0.5, sigma=0.15), and plot a histogram of these numbers using matplotlib\n", - "- set a title and the axis labels for the plot" + "- set a title and the axis labels for the plot\n" ] }, { @@ -923,9 +104,9 @@ "source": [ "import random\n", "\n", - "values = [random.gauss(.5, 0.15) for i in range(100)]\n", + "values = [random.gauss(0.5, 0.15) for i in range(100)]\n", "\n", - "plt.hist(values, bins=10, color='skyblue', edgecolor='black')\n", + "plt.hist(values, bins=10, color=\"skyblue\", edgecolor=\"black\")\n", "plt.title(\"Histogram of 100 Random Numbers from a Gauss Distribution\")\n", "plt.xlabel(\"Value\")\n", "plt.ylabel(\"Frequency\")\n", @@ -937,9 +118,9 @@ "id": "7b7baea3", "metadata": {}, "source": [ - "## Plotnine\n", + "### Plotnine\n", "\n", - "- load the file `../data/iris.data` into a pandas DataFrame, and create a scatter plot of septal length vs. septal width using plotnine, coloring the points by species" + "- load the file `../data/iris.data` into a pandas DataFrame, and create a scatter plot of septal length vs. septal width using plotnine, coloring the points by species\n" ] }, { @@ -951,19 +132,22 @@ "source": [ "import pandas as pd\n", "\n", - "from plotnine import (\n", - " ggplot, aes,\n", - " geom_point, labs, theme_minimal, scale_color_brewer)\n", + "from plotnine import ggplot, aes, geom_point, labs, theme_minimal, scale_color_brewer\n", "\n", - "df = pd.read_csv('../data/iris.data')\n", + "df = pd.read_csv(\"../data/iris.data\")\n", "\n", "\n", - "(ggplot(df, aes(\"septal_length\", \"septal_width\", color=\"species\"))\n", - "+ geom_point(alpha=0.7)\n", - "+ scale_color_brewer(type=\"qual\", palette=\"Set2\")\n", - "+ labs(title=\"Scatter: septal length vs width, colored by group\", x=\"septal length\", y=\"septal width\")\n", - "+ theme_minimal())\n", - "\n" + "(\n", + " ggplot(df, aes(\"septal_length\", \"septal_width\", color=\"species\"))\n", + " + geom_point(alpha=0.7)\n", + " + scale_color_brewer(type=\"qual\", palette=\"Set2\")\n", + " + labs(\n", + " title=\"Scatter: septal length vs width, colored by group\",\n", + " x=\"septal length\",\n", + " y=\"septal width\",\n", + " )\n", + " + theme_minimal()\n", + ")" ] } ], From 23aaa0a817c6bf9b7d9e3808eceb67e4e780006c Mon Sep 17 00:00:00 2001 From: m-jahn Date: Thu, 23 Apr 2026 09:51:14 +0200 Subject: [PATCH 2/5] fix: applied black autoformatting to all notebooks --- lessons/lesson_01.ipynb | 22 ++-- lessons/lesson_02.ipynb | 25 ++-- lessons/lesson_03.ipynb | 14 +-- lessons/lesson_04.ipynb | 112 +++++++++-------- lessons/lesson_06.ipynb | 124 +++++++++++-------- lessons/lesson_07.ipynb | 234 +++++++++++++++++++---------------- lessons/lesson_08.ipynb | 159 ++++++++++++++++-------- solutions/solutions_01.ipynb | 12 +- solutions/solutions_02.ipynb | 14 ++- solutions/solutions_03.ipynb | 6 +- solutions/solutions_04.ipynb | 32 ++--- solutions/solutions_06.ipynb | 83 ++++++++----- solutions/solutions_07.ipynb | 44 ++++--- 13 files changed, 515 insertions(+), 366 deletions(-) diff --git a/lessons/lesson_01.ipynb b/lessons/lesson_01.ipynb index dda519a..fda478e 100644 --- a/lessons/lesson_01.ipynb +++ b/lessons/lesson_01.ipynb @@ -181,7 +181,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"Hello World!\") # Code before the hash sign will be executed" + "print(\"Hello World!\") # Code before the hash sign will be executed" ] }, { @@ -210,7 +210,7 @@ "source": [ "a = 3\n", "b = 5\n", - "a + b**2 # this means to the power of 2" + "a + b**2 # this means to the power of 2" ] }, { @@ -322,7 +322,7 @@ "outputs": [], "source": [ "a = 2 # int\n", - "b = 3.14 # float\n", + "b = 3.14 # float\n", "\n", "print(\"a is of type\", type(a))\n", "print(\"b is of type\", type(b))\n", @@ -419,8 +419,8 @@ "outputs": [], "source": [ "# Note how we are using a list defined above in a prior Jupyter cell!\n", - "print(list_num[2]) # indexing\n", - "print(list_num[1:3]) # slicing\n", + "print(list_num[2]) # indexing\n", + "print(list_num[1:3]) # slicing\n", "# Note how we are using a list defined above in a prior Jupyter cell!\\n" ] }, @@ -557,7 +557,7 @@ "import numpy as np\n", "\n", "# series of numbers defined by start, stop, step\n", - "x = np.arange(0, 4*np.pi, 0.1)\n", + "x = np.arange(0, 4 * np.pi, 0.1)\n", "y = np.sin(x)\n", "plt.plot(x, y)\n", "plt.show()" @@ -630,7 +630,7 @@ "# what is wrong here? Fix it.\n", "\n", "a = 12\n", - "b = '23'\n", + "b = \"23\"\n", "print(a + b)" ] }, @@ -655,7 +655,7 @@ "outputs": [], "source": [ "# Solution\n", - "a = 30\n" + "a = 30" ] }, { @@ -671,7 +671,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Solution\n" + "# Solution" ] }, { @@ -688,7 +688,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Solution\n" + "# Solution" ] }, { @@ -706,7 +706,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Solution\n" + "# Solution" ] } ], diff --git a/lessons/lesson_02.ipynb b/lessons/lesson_02.ipynb index e8c0935..a8e50e7 100644 --- a/lessons/lesson_02.ipynb +++ b/lessons/lesson_02.ipynb @@ -587,12 +587,12 @@ "metadata": {}, "outputs": [], "source": [ - "seq = 'GTACCTTGATTTCGTAA'\n", + "seq = \"GTACCTTGATTTCGTAA\"\n", "\n", - "print(seq.find('GTA'))\n", - "print(seq.find('CCT'))\n", - "print(seq.find('GTA', 1))\n", - "print(seq.find('GTA', 1, 10))" + "print(seq.find(\"GTA\"))\n", + "print(seq.find(\"CCT\"))\n", + "print(seq.find(\"GTA\", 1))\n", + "print(seq.find(\"GTA\", 1, 10))" ] }, { @@ -608,7 +608,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(seq.rfind('TTT'))" + "print(seq.rfind(\"TTT\"))" ] }, { @@ -624,7 +624,7 @@ "metadata": {}, "outputs": [], "source": [ - "seq.count('TA')" + "seq.count(\"TA\")" ] }, { @@ -667,8 +667,8 @@ "print(seq.strip(\"T\")) # removes nothing\n", "print(seq.lstrip(\"TAGC\")) # removes all leading letters matching the query\n", "print(seq.rstrip(\"tagc\")) # removes all trailing letters matching the query\n", - "print(seq.removeprefix(\"GtAc\")) # removes exact prefix\n", - "print(seq.removesuffix(\"GTaa\")) # removes exact suffix" + "print(seq.removeprefix(\"GtAc\")) # removes exact prefix\n", + "print(seq.removesuffix(\"GTaa\")) # removes exact suffix" ] }, { @@ -786,8 +786,7 @@ "# whole divider\n", "\n", "\n", - "# Remainder\n", - "\n" + "# Remainder" ] }, { @@ -818,7 +817,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Solution\n" + "# Solution" ] }, { @@ -848,7 +847,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Solution\n" + "# Solution" ] } ], diff --git a/lessons/lesson_03.ipynb b/lessons/lesson_03.ipynb index 9792fe0..65fe4e8 100644 --- a/lessons/lesson_03.ipynb +++ b/lessons/lesson_03.ipynb @@ -443,7 +443,7 @@ "source": [ "# Solution\n", "dna1 = \"ATTATTAGGACCACA\"\n", - "dna2 = \"ATTATTAGGAACACA\"\n" + "dna2 = \"ATTATTAGGAACACA\"" ] }, { @@ -461,7 +461,7 @@ "source": [ "# Solution\n", "seq1 = \"ACGTAGCTAGCTAGCTAGCTA\"\n", - "seq2 = \"GTGCATGCTAGCTAGCTAGCA\"\n" + "seq2 = \"GTGCATGCTAGCTAGCTAGCA\"" ] }, { @@ -479,7 +479,7 @@ "outputs": [], "source": [ "# Solution\n", - "seq = \"ATGGACGTAGTCGGCCCGTGAAAGCGATCGATCG\"\n" + "seq = \"ATGGACGTAGTCGGCCCGTGAAAGCGATCGATCG\"" ] }, { @@ -505,7 +505,7 @@ "source": [ "# Solution\n", "seq1 = \"GGCTATGCCGCCGTTATACTCGAGACTAAGTAGTC\"\n", - "seq2 = \"GGCTATGCCGCCGTTATATCGAGACTAAAGTAGTC\"\n" + "seq2 = \"GGCTATGCCGCCGTTATATCGAGACTAAAGTAGTC\"" ] }, { @@ -522,7 +522,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Solution\n" + "# Solution" ] }, { @@ -539,11 +539,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Solution\n" + "# Solution" ] } ], - "metadata": { + "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "python-course", diff --git a/lessons/lesson_04.ipynb b/lessons/lesson_04.ipynb index 3e0ea26..d7ca08d 100644 --- a/lessons/lesson_04.ipynb +++ b/lessons/lesson_04.ipynb @@ -68,7 +68,7 @@ "outputs": [], "source": [ "print(my_list * 3)\n", - "my_nested_list = [[1, 2, 3], [3, 2, [1]], 'a', 'b', 'c', 'a', 'a', 'a']\n", + "my_nested_list = [[1, 2, 3], [3, 2, [1]], \"a\", \"b\", \"c\", \"a\", \"a\", \"a\"]\n", "print(my_nested_list)" ] }, @@ -100,7 +100,7 @@ "source": [ "print(my_nested_list)\n", "print(my_nested_list[0])\n", - "print(my_nested_list[-4: ])" + "print(my_nested_list[-4:])" ] }, { @@ -159,16 +159,16 @@ "outputs": [], "source": [ "list1 = [1, 2, 3, 4, 5]\n", - "print(list1, '\\n')\n", + "print(list1, \"\\n\")\n", "\n", "list1.append(6)\n", - "print(list1, '\\n')\n", + "print(list1, \"\\n\")\n", "\n", - "list1.insert(3, 'a')\n", - "print(list1, '\\n')\n", + "list1.insert(3, \"a\")\n", + "print(list1, \"\\n\")\n", "\n", "print(list1.pop(3))\n", - "print(list1, '\\n')\n", + "print(list1, \"\\n\")\n", "\n", "print(list1.remove(6))\n", "print(list1)" @@ -195,19 +195,19 @@ "outputs": [], "source": [ "list1 = [1, 4, 5, 6, 2, 3]\n", - "print(list1, '\\n')\n", + "print(list1, \"\\n\")\n", "\n", "list1.sort()\n", - "print(list1, '\\n')\n", + "print(list1, \"\\n\")\n", "\n", "list1.reverse()\n", - "print(list1, '\\n')\n", + "print(list1, \"\\n\")\n", "\n", "list1 = [1, 4, 5, 6, 2, 3]\n", "print(list1)\n", "list2 = sorted(list1, reverse=True)\n", "print(list1)\n", - "print(list2)\n" + "print(list2)" ] }, { @@ -229,8 +229,8 @@ "source": [ "my_list = [1, 2, 3, 4, 5]\n", "print(my_list)\n", - "my_list[4] = 'e'\n", - "print(my_list)\n" + "my_list[4] = \"e\"\n", + "print(my_list)" ] }, { @@ -251,12 +251,12 @@ "outputs": [], "source": [ "my_list = [1, 2, 3, 4, 5]\n", - "print('original list: ', my_list)\n", + "print(\"original list: \", my_list)\n", "new_list = my_list\n", - "print('apparent copy: ', new_list)\n", - "new_list[0] = 'a'\n", - "print('modified apparent copy:', new_list)\n", - "print('original list: ', my_list)" + "print(\"apparent copy: \", new_list)\n", + "new_list[0] = \"a\"\n", + "print(\"modified apparent copy:\", new_list)\n", + "print(\"original list: \", my_list)" ] }, { @@ -279,12 +279,12 @@ "outputs": [], "source": [ "my_list = [1, 2, 3, 4, 5]\n", - "print('original list: ', my_list)\n", + "print(\"original list: \", my_list)\n", "new_list = my_list.copy()\n", - "print('apparent copy: ', new_list)\n", - "new_list[0] = 'a'\n", - "print('modified apparent copy:', new_list)\n", - "print('original list: ', my_list)" + "print(\"apparent copy: \", new_list)\n", + "new_list[0] = \"a\"\n", + "print(\"modified apparent copy:\", new_list)\n", + "print(\"original list: \", my_list)" ] }, { @@ -310,8 +310,8 @@ "my_tuple = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)\n", "print(my_tuple)\n", "print(type(my_tuple))\n", - "my_second_tuple = ( 1, 2, 3, [])\n", - "my_second_tuple[3].append('a')\n", + "my_second_tuple = (1, 2, 3, [])\n", + "my_second_tuple[3].append(\"a\")\n", "print(my_second_tuple)" ] }, @@ -331,7 +331,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_list = ['a', 'b']\n", + "my_list = [\"a\", \"b\"]\n", "val1, val2 = my_list\n", "print(val1)\n", "print(val2)" @@ -355,10 +355,10 @@ "metadata": {}, "outputs": [], "source": [ - "my_tuple = ( [1, 2, 3], [4, 5, 6] )\n", + "my_tuple = ([1, 2, 3], [4, 5, 6])\n", "list1, list2 = my_tuple\n", "print(list1)\n", - "print( type(list1) )" + "print(type(list1))" ] }, { @@ -379,11 +379,11 @@ "metadata": {}, "outputs": [], "source": [ - "string = 'abracadabra'\n", + "string = \"abracadabra\"\n", "print(string)\n", "print(list(string))\n", "print(tuple(string))\n", - "print(set(string)) # will be explained later on" + "print(set(string)) # will be explained later on" ] }, { @@ -403,10 +403,22 @@ "metadata": {}, "outputs": [], "source": [ - "word = 'Supercalifragilisticexpialidocious'\n", + "word = \"Supercalifragilisticexpialidocious\"\n", "print(len(word))\n", "\n", - "ingredients = [\"1/2 cup sugar\", \"1/2 cup packed brown sugar\", \"3 tablespoons all-purpose flour\", \"1 teaspoon ground cinnamon\", \"1/4 teaspoon ground ginger\", \"1/4 teaspoon ground nutmeg\", \"6 to 7 cups thinly sliced peeled tart apples\", \"1 tablespoon lemon juice\", \"Pastry for double-crust pie (9 inches)\", \"1 tablespoon butter\", \"1 large egg white\"]\n", + "ingredients = [\n", + " \"1/2 cup sugar\",\n", + " \"1/2 cup packed brown sugar\",\n", + " \"3 tablespoons all-purpose flour\",\n", + " \"1 teaspoon ground cinnamon\",\n", + " \"1/4 teaspoon ground ginger\",\n", + " \"1/4 teaspoon ground nutmeg\",\n", + " \"6 to 7 cups thinly sliced peeled tart apples\",\n", + " \"1 tablespoon lemon juice\",\n", + " \"Pastry for double-crust pie (9 inches)\",\n", + " \"1 tablespoon butter\",\n", + " \"1 large egg white\",\n", + "]\n", "print(len(ingredients))" ] }, @@ -433,10 +445,10 @@ "metadata": {}, "outputs": [], "source": [ - "print( range(5) )\n", - "print( type( range(5) ) )\n", - "print( list( range(5) ) )\n", - "print( list( range(5, 10, 2) ) )" + "print(range(5))\n", + "print(type(range(5)))\n", + "print(list(range(5)))\n", + "print(list(range(5, 10, 2)))" ] }, { @@ -456,9 +468,9 @@ "metadata": {}, "outputs": [], "source": [ - "l = ['a', 'b', 'c']\n", - "print( type( enumerate(l) ) )\n", - "print( list( enumerate(l, 55) ) )" + "l = [\"a\", \"b\", \"c\"]\n", + "print(type(enumerate(l)))\n", + "print(list(enumerate(l, 55)))" ] }, { @@ -506,7 +518,7 @@ "metadata": {}, "outputs": [], "source": [ - "genes = ['BRCA1', 'TP53', 'EGFR', 'VEGFA']\n", + "genes = [\"BRCA1\", \"TP53\", \"EGFR\", \"VEGFA\"]\n", "for index, gene in enumerate(genes, start=1):\n", " print(f\"Gene {index}: {gene}\")" ] @@ -570,7 +582,7 @@ " count += 1\n", " if count > 10:\n", " break\n", - "print(F'Count reached {count}. Done.')" + "print(f\"Count reached {count}. Done.\")" ] }, { @@ -623,19 +635,20 @@ "metadata": {}, "outputs": [], "source": [ - "print('First example:')\n", + "print(\"First example:\")\n", "for i in range(10):\n", " if i > 8:\n", - " print('Interrupting first for-loop.')\n", + " print(\"Interrupting first for-loop.\")\n", " break\n", "else:\n", - " print('First for loop finished without any issues.')\n", - " \n", - "print('\\nSecond example:')\n", + " print(\"First for loop finished without any issues.\")\n", + "\n", + "print(\"\\nSecond example:\")\n", "for i in range(10):\n", - " if i > 10: break\n", + " if i > 10:\n", + " break\n", "else:\n", - " print('Second for loop finished without any issues.')\n" + " print(\"Second for loop finished without any issues.\")" ] }, { @@ -982,8 +995,7 @@ "metadata": {}, "outputs": [], "source": [ - "string = \"\"\"CTEYKE,YKEYKEYKE,YKEYKECTE,YKEYKE,YKECTECTE,SKQCTE,YKECTE,CTECTEYKE,YKEYKE,YKE,CTEYKE,CTECTY,YKECTEYKE,GVQYCHSRS,CTECTEYKE,GVQYKEGVQ,YKEYKE,YKEYKEYKE,YKEHMQ,SATYKEYVK,YKECTECMS,YKEYKECTE,YKELTCCTE,YKEHMQ,CTEYKE,DPR,YKEYKECTE,YKERMNYKE,YKECTECTE,WQWCTEYKE,YKEDIH,CTEYKECTE,CTEYKEMDR,YKEYKETIY,CTEHMQYKE,CTE,SGQ,SYCYKEQCT,CTECTECTE,YKEYKE,YKETCRYKE,FNWGVQEMW,YKEYKE,CTEGVQYKE,YKEYCH,YKEYKEGVQ,FCTYCHCTE,FNWCMSYKE,YKE,SATYKE,YKEYKERIH,GVQYKE,MDRYKECTE,YKEYKE,ASMCTEYKE,HMQCTE,DIHYKEYKE,YKEQCT,YKEYKE,YKE,YKEYKE,SATCTE,YAECTECTE,CTECTE,CTEYKE,CTE,YKEYKEYKE,YKEYKEYKE,YKEYKEHMQ,CTE,QCT,CTEFNWCTE,TCSFNW,CTEYKE,YKE,YKEYKEYKE,CTE,SGQGVQ,YKEYKEKKL,CTECTEYKE,DIHYKEYKE,YKECTECTE,YKEYKECTE,HMQYKECTE,YKECTE,YKE,FCTYKECTE,YKECTELVK,YKEYKEGVQ,YKEYKERNM,CTEYKEYKE,IWICTECTE,CTECTEYKE,CTECTEYKE,YKECTEYKE,SKQ,YKE,MDRCTE,YKECTECTE,CTEYKEHMQ\"\"\"\n", - "\n" + "string = \"\"\"CTEYKE,YKEYKEYKE,YKEYKECTE,YKEYKE,YKECTECTE,SKQCTE,YKECTE,CTECTEYKE,YKEYKE,YKE,CTEYKE,CTECTY,YKECTEYKE,GVQYCHSRS,CTECTEYKE,GVQYKEGVQ,YKEYKE,YKEYKEYKE,YKEHMQ,SATYKEYVK,YKECTECMS,YKEYKECTE,YKELTCCTE,YKEHMQ,CTEYKE,DPR,YKEYKECTE,YKERMNYKE,YKECTECTE,WQWCTEYKE,YKEDIH,CTEYKECTE,CTEYKEMDR,YKEYKETIY,CTEHMQYKE,CTE,SGQ,SYCYKEQCT,CTECTECTE,YKEYKE,YKETCRYKE,FNWGVQEMW,YKEYKE,CTEGVQYKE,YKEYCH,YKEYKEGVQ,FCTYCHCTE,FNWCMSYKE,YKE,SATYKE,YKEYKERIH,GVQYKE,MDRYKECTE,YKEYKE,ASMCTEYKE,HMQCTE,DIHYKEYKE,YKEQCT,YKEYKE,YKE,YKEYKE,SATCTE,YAECTECTE,CTECTE,CTEYKE,CTE,YKEYKEYKE,YKEYKEYKE,YKEYKEHMQ,CTE,QCT,CTEFNWCTE,TCSFNW,CTEYKE,YKE,YKEYKEYKE,CTE,SGQGVQ,YKEYKEKKL,CTECTEYKE,DIHYKEYKE,YKECTECTE,YKEYKECTE,HMQYKECTE,YKECTE,YKE,FCTYKECTE,YKECTELVK,YKEYKEGVQ,YKEYKERNM,CTEYKEYKE,IWICTECTE,CTECTEYKE,CTECTEYKE,YKECTEYKE,SKQ,YKE,MDRCTE,YKECTECTE,CTEYKEHMQ\"\"\"" ] }, { diff --git a/lessons/lesson_06.ipynb b/lessons/lesson_06.ipynb index ef24fdb..30bc85b 100644 --- a/lessons/lesson_06.ipynb +++ b/lessons/lesson_06.ipynb @@ -58,13 +58,15 @@ "source": [ "# function w/o arguments\n", "\n", + "\n", "def my_function():\n", - " res = 'Hello from inside the function!' \n", - " print(res) \n", + " res = \"Hello from inside the function!\"\n", + " print(res)\n", " return res\n", - " \n", + "\n", + "\n", "my_function()\n", - "print('Hello back from the outside of the function.')" + "print(\"Hello back from the outside of the function.\")" ] }, { @@ -76,10 +78,11 @@ "source": [ "# basic arguments for function\n", "def ratio(num1, num2):\n", - " '''calculates the ratio of 'num1' vs 'num2'.'''\n", + " \"\"\"calculates the ratio of 'num1' vs 'num2'.\"\"\"\n", " return num1 / num2\n", "\n", - "print( ratio(4, 2) )\n", + "\n", + "print(ratio(4, 2))\n", "\n", "# help text via the built-in `help` function\n", "help(ratio)" @@ -94,10 +97,13 @@ "source": [ "def einstein():\n", " \"\"\"Prints a message but does not return anything.\"\"\"\n", - " print(\"Insanity: doing the same thing over and over again and expecting different results.\")\n", - " \n", + " print(\n", + " \"Insanity: doing the same thing over and over again and expecting different results.\"\n", + " )\n", + "\n", + "\n", "result = einstein()\n", - "print('\\nReturn:', result)" + "print(\"\\nReturn:\", result)" ] }, { @@ -112,9 +118,10 @@ " diff1 = val1 - val2\n", " return diff1\n", "\n", - "print( calc_difference(6, 3) )\n", - "print( calc_difference(val1=3, val2=6) )\n", - "print( calc_difference(val2=3, val1=6) )" + "\n", + "print(calc_difference(6, 3))\n", + "print(calc_difference(val1=3, val2=6))\n", + "print(calc_difference(val2=3, val1=6))" ] }, { @@ -125,13 +132,14 @@ "outputs": [], "source": [ "# function with defaults\n", - "def new_function(arg1, arg2, arg3='test', args4='another one'):\n", + "def new_function(arg1, arg2, arg3=\"test\", args4=\"another one\"):\n", " some_result = arg1 * arg2\n", " return (some_result, arg3, args4)\n", "\n", - "print( new_function(5, 6) )\n", - "print( new_function(8, 7, 'bla') )\n", - "print( new_function(8, 7, 'bla', 'foo') )" + "\n", + "print(new_function(5, 6))\n", + "print(new_function(8, 7, \"bla\"))\n", + "print(new_function(8, 7, \"bla\", \"foo\"))" ] }, { @@ -142,12 +150,13 @@ "outputs": [], "source": [ "# function with defined input and output data types\n", - "def new_function(arg1: list, arg2: int, arg3: str ='test') -> str: #\n", + "def new_function(arg1: list, arg2: int, arg3: str = \"test\") -> str: #\n", " some_result = f\"result for {arg3}: {sum(arg1) * arg2}\"\n", " return some_result\n", "\n", - "print( new_function([2, 3], 3) )\n", - "print( new_function([1, 2], 2, 'c') )" + "\n", + "print(new_function([2, 3], 3))\n", + "print(new_function([1, 2], 2, \"c\"))" ] }, { @@ -157,7 +166,7 @@ "metadata": {}, "outputs": [], "source": [ - "print( new_function(['a','b'], 'a') )" + "print(new_function([\"a\", \"b\"], \"a\"))" ] }, { @@ -168,12 +177,15 @@ "outputs": [], "source": [ "# function with defined input and output data types - example for different return type\n", - "def new_function(arg1: list, arg2: int, arg3: str ='test') -> int:\n", + "def new_function(arg1: list, arg2: int, arg3: str = \"test\") -> int:\n", " some_result = str(arg1) * arg2 + arg3\n", " return some_result\n", "\n", - "print( new_function(['a','b'], 3) )\n", - "print('The anticipated return data type is not being checked and will not raise an error!')" + "\n", + "print(new_function([\"a\", \"b\"], 3))\n", + "print(\n", + " \"The anticipated return data type is not being checked and will not raise an error!\"\n", + ")" ] }, { @@ -188,6 +200,7 @@ " a = a + b\n", " return [a, b]\n", "\n", + "\n", "a = 20\n", "b1 = 5\n", "res = isolated(a, b1)\n", @@ -217,12 +230,14 @@ "outputs": [], "source": [ "# example overwriting a built in function\n", - "print( sum([1, 2, 3]) )\n", + "print(sum([1, 2, 3]))\n", + "\n", "\n", "def sum(a):\n", " return a\n", "\n", - "print( sum([1, 2, 3]) )" + "\n", + "print(sum([1, 2, 3]))" ] }, { @@ -245,10 +260,11 @@ "def multiply(factor1, factor2):\n", " return factor1 * factor2\n", "\n", - "print( multiply(2, 3) )\n", "\n", - "my_dict = {'factor1': 5, 'factor2': 6}\n", - "print( multiply(**my_dict) )" + "print(multiply(2, 3))\n", + "\n", + "my_dict = {\"factor1\": 5, \"factor2\": 6}\n", + "print(multiply(**my_dict))" ] }, { @@ -258,8 +274,8 @@ "metadata": {}, "outputs": [], "source": [ - "my_dict = {'factor1': 2, 'factor2': 3, 'factor3': 4}\n", - "print( multiply(**my_dict) )" + "my_dict = {\"factor1\": 2, \"factor2\": 3, \"factor3\": 4}\n", + "print(multiply(**my_dict))" ] }, { @@ -281,21 +297,23 @@ "source": [ "def function_l():\n", " s = \"I love Minneapolis!\"\n", - " print('Var s inside local function:', s, '\\n')\n", + " print(\"Var s inside local function:\", s, \"\\n\")\n", + "\n", "\n", "def function_g():\n", " global s\n", " s = \"I love Seattle!\"\n", - " print('Var s inside global function:', s, '\\n')\n", + " print(\"Var s inside global function:\", s, \"\\n\")\n", + "\n", "\n", - "s = \"I love NYC!\" \n", - "print('Var s outside any function:', s, '\\n')\n", + "s = \"I love NYC!\"\n", + "print(\"Var s outside any function:\", s, \"\\n\")\n", "\n", "function_l()\n", - "print('Var s outside local function:', s, '\\n')\n", + "print(\"Var s outside local function:\", s, \"\\n\")\n", "\n", "function_g()\n", - "print('Var s outside global function:', s)" + "print(\"Var s outside global function:\", s)" ] }, { @@ -337,9 +355,11 @@ "metadata": {}, "outputs": [], "source": [ - "infile = open('../data/seqs.fas', 'r')\n", + "infile = open(\"../data/seqs.fas\", \"r\")\n", "for line in infile:\n", - " print(repr(line)) # repr() shows the string as it is stored in memory, including special characters like \\n for newlines\n", + " print(\n", + " repr(line)\n", + " ) # repr() shows the string as it is stored in memory, including special characters like \\n for newlines\n", " print(line)\n", "infile.close()" ] @@ -351,10 +371,10 @@ "metadata": {}, "outputs": [], "source": [ - "outfile = open('../data/new_seqs.fas', 'w')\n", - "for e, seq in enumerate(['aaaa', 'cccc', 'gggg', 'tttt']):\n", - " outfile.write(F\">{e}\\n{seq}\\n\")\n", - "outfile.close() " + "outfile = open(\"../data/new_seqs.fas\", \"w\")\n", + "for e, seq in enumerate([\"aaaa\", \"cccc\", \"gggg\", \"tttt\"]):\n", + " outfile.write(f\">{e}\\n{seq}\\n\")\n", + "outfile.close()" ] }, { @@ -364,9 +384,11 @@ "metadata": {}, "outputs": [], "source": [ - "infile = open('../data/new_seqs.fas', 'r')\n", + "infile = open(\"../data/new_seqs.fas\", \"r\")\n", "for line in infile:\n", - " print(line.strip()) # removes white spaces and line breaks on the right side, i.e. aaaa\\n -> aaaa\n", + " print(\n", + " line.strip()\n", + " ) # removes white spaces and line breaks on the right side, i.e. aaaa\\n -> aaaa\n", "infile.close()" ] }, @@ -388,9 +410,11 @@ "metadata": {}, "outputs": [], "source": [ - "with open('../data/seqs.fas', 'a') as outfile:\n", - " for e, seq in enumerate(['acgt', 'tgca'], start=4): # optional start parameter for enumerate, default is 0\n", - " outfile.write(F\">{e}\\n{seq}\\n\")" + "with open(\"../data/seqs.fas\", \"a\") as outfile:\n", + " for e, seq in enumerate(\n", + " [\"acgt\", \"tgca\"], start=4\n", + " ): # optional start parameter for enumerate, default is 0\n", + " outfile.write(f\">{e}\\n{seq}\\n\")" ] }, { @@ -400,7 +424,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open('../data/seqs.fas', 'r') as infile:\n", + "with open(\"../data/seqs.fas\", \"r\") as infile:\n", " for line in infile:\n", " print(line.strip())" ] @@ -422,10 +446,10 @@ "metadata": {}, "outputs": [], "source": [ - "with open('../data/seqs.fas', 'r') as infile:\n", + "with open(\"../data/seqs.fas\", \"r\") as infile:\n", " content = infile.read()\n", "\n", - "print( repr(content) )" + "print(repr(content))" ] }, { diff --git a/lessons/lesson_07.ipynb b/lessons/lesson_07.ipynb index f490c59..5f174ba 100644 --- a/lessons/lesson_07.ipynb +++ b/lessons/lesson_07.ipynb @@ -77,22 +77,31 @@ "outputs": [], "source": [ "import random\n", - "#examples:\n", - "print('Random integer:', random.randint(5, 10)) # a random integer N such that a <= N <= b\n", + "\n", + "# examples:\n", + "print(\n", + " \"Random integer:\", random.randint(5, 10)\n", + ") # a random integer N such that a <= N <= b\n", "\n", "list1 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n", - "print('Random choice:', random.choice(list1)) # a random element from list\n", + "print(\"Random choice:\", random.choice(list1)) # a random element from list\n", "\n", "random.shuffle(list1)\n", - "print('In-place shuffle:', list1) #Shuffle the sequence x in place\n", + "print(\"In-place shuffle:\", list1) # Shuffle the sequence x in place\n", "\n", - "print('Unique subset:', random.sample(list1, 4)) # a k length list of unique elements from population. random sampling without replacement\n", + "print(\n", + " \"Unique subset:\", random.sample(list1, 4)\n", + ") # a k length list of unique elements from population. random sampling without replacement\n", "\n", - "print('Random float, uniform distribution:', random.random()) # random float number [0.0, 1.0]\n", + "print(\n", + " \"Random float, uniform distribution:\", random.random()\n", + ") # random float number [0.0, 1.0]\n", "\n", - "print('Random float gauss distribution:', random.gauss(mu=0.5, sigma=0.66)) #Gaussian distribution, mu=mean, sigma=standard deviation\n", + "print(\n", + " \"Random float gauss distribution:\", random.gauss(mu=0.5, sigma=0.66)\n", + ") # Gaussian distribution, mu=mean, sigma=standard deviation\n", "\n", - "print('Random sequence:', ''.join( random.choices( list('ACTG'), k=50) ) )" + "print(\"Random sequence:\", \"\".join(random.choices(list(\"ACTG\"), k=50)))" ] }, { @@ -127,17 +136,17 @@ "source": [ "import gzip\n", "\n", - "with gzip.open('../data/my_compressed_data.txt.gz', 'wt') as outfile:\n", - " outfile.write('test data 123456789')\n", + "with gzip.open(\"../data/my_compressed_data.txt.gz\", \"wt\") as outfile:\n", + " outfile.write(\"test data 123456789\")\n", "\n", "try:\n", - " with open('data/my_compressed_data.txt.gz', 'r') as infile:\n", + " with open(\"data/my_compressed_data.txt.gz\", \"r\") as infile:\n", " print(infile.read())\n", "except Exception as e:\n", - " print('Error reading compressed file:', e, '\\n')\n", + " print(\"Error reading compressed file:\", e, \"\\n\")\n", "\n", - "with gzip.open('../data/my_compressed_data.txt.gz', 'rt') as infile:\n", - " print(infile.read())" + "with gzip.open(\"../data/my_compressed_data.txt.gz\", \"rt\") as infile:\n", + " print(infile.read())" ] }, { @@ -169,9 +178,17 @@ "outputs": [], "source": [ "import argparse\n", - "parser = argparse.ArgumentParser(description='this is a test program')\n", - "parser.add_argument('-i', dest='info')\n", - "parser.add_argument('--unit','-u', dest='unit_test', default=False, action='store_true', help='run unit test')\n", + "\n", + "parser = argparse.ArgumentParser(description=\"this is a test program\")\n", + "parser.add_argument(\"-i\", dest=\"info\")\n", + "parser.add_argument(\n", + " \"--unit\",\n", + " \"-u\",\n", + " dest=\"unit_test\",\n", + " default=False,\n", + " action=\"store_true\",\n", + " help=\"run unit test\",\n", + ")\n", "args = parser.parse_args()\n", "\n", "# print(args.info)\n", @@ -214,24 +231,25 @@ "outputs": [], "source": [ "import math\n", - "#examples:\n", - "print(math.pi) # mathematical constant Ο€ = 3.141592…\n", - "print(math.e) # mathematical constant e = 2.718281\n", + "\n", + "# examples:\n", + "print(math.pi) # mathematical constant Ο€ = 3.141592…\n", + "print(math.e) # mathematical constant e = 2.718281\n", "\n", "pi = math.pi\n", "\n", - "print('\\nRound properly:', round(pi, 5) ) # built-in rounding function\n", + "print(\"\\nRound properly:\", round(pi, 5)) # built-in rounding function\n", "\n", - "print('Round up:', math.ceil(2.2) )\n", - "print('Round down:', math.floor(3.5) )\n", - "print('Remainder:', math.fmod(7, 2) ) # 7 % 2\n", - "print('Log:', math.log(256, 2)) # logarithmic function (number, base)\n", - "print('Exponential:', math.pow(2, 10.5) ) # x to the power of y, 2**10.5\n", - "print('Square root:', math.sqrt(81) )\n", - "print('Arc sine:', math.asin(.5) ) # in radians\n", - "print('Sine:', math.sin(pi / 2) )\n", - "print('Degrees:', math.degrees(pi) ) # convert x from radians to degrees\n", - "print('Gamma:', math.gamma(3) ) # Gamma function at x\n" + "print(\"Round up:\", math.ceil(2.2))\n", + "print(\"Round down:\", math.floor(3.5))\n", + "print(\"Remainder:\", math.fmod(7, 2)) # 7 % 2\n", + "print(\"Log:\", math.log(256, 2)) # logarithmic function (number, base)\n", + "print(\"Exponential:\", math.pow(2, 10.5)) # x to the power of y, 2**10.5\n", + "print(\"Square root:\", math.sqrt(81))\n", + "print(\"Arc sine:\", math.asin(0.5)) # in radians\n", + "print(\"Sine:\", math.sin(pi / 2))\n", + "print(\"Degrees:\", math.degrees(pi)) # convert x from radians to degrees\n", + "print(\"Gamma:\", math.gamma(3)) # Gamma function at x" ] }, { @@ -255,15 +273,15 @@ "import re\n", "\n", "# reading file\n", - "with open('../data/towels.txt', 'r') as infile:\n", + "with open(\"../data/towels.txt\", \"r\") as infile:\n", " story = infile.read()\n", "\n", "# create pattern\n", - "pattern = '.*[H|h]itch *[H|h]iker.*'\n", + "pattern = \".*[H|h]itch *[H|h]iker.*\"\n", "regex = re.compile(pattern)\n", "\n", "# search for pattern and print each line with the pattern in it\n", - "print( regex.findall(story) )" + "print(regex.findall(story))" ] }, { @@ -336,17 +354,19 @@ "metadata": {}, "outputs": [], "source": [ - "sequence = '''ATGGATAAGAAATACTCAATAGGCTTAGATATCGGCACAAATAGCGTCGGATGGGCGGTGATCACTGATG\n", + "sequence = \"\"\"ATGGATAAGAAATACTCAATAGGCTTAGATATCGGCACAAATAGCGTCGGATGGGCGGTGATCACTGATG\n", "AATATAAGGTTCCGTCTAAAAAGTTCAAGGTTCTGGGAAATACAGACCGCCACAGTATCAAAAAAAATCT\n", - "TATAGGGGCTCTTTTATTTGACAGTGGAGAGACAGCGGAAGCGACTCGTCTCAAACGGACAGCTCGTAGA'''\n", - "pattern = re.compile('AT[ACT]GG[ACGT]')# represents AA sequence 'IG' = Isoleucine + Glycine\n", + "TATAGGGGCTCTTTTATTTGACAGTGGAGAGACAGCGGAAGCGACTCGTCTCAAACGGACAGCTCGTAGA\"\"\"\n", + "pattern = re.compile(\n", + " \"AT[ACT]GG[ACGT]\"\n", + ") # represents AA sequence 'IG' = Isoleucine + Glycine\n", "\n", - "print('first match', pattern.search(sequence))\n", - "print('match beginning', pattern.match(sequence))\n", - "print('whole string match', pattern.fullmatch(sequence))\n", - "print('list of matches', pattern.findall(sequence))\n", - "print('iterator for matches', pattern.finditer(sequence))\n", - "print('split at matches', pattern.split(sequence))\n" + "print(\"first match\", pattern.search(sequence))\n", + "print(\"match beginning\", pattern.match(sequence))\n", + "print(\"whole string match\", pattern.fullmatch(sequence))\n", + "print(\"list of matches\", pattern.findall(sequence))\n", + "print(\"iterator for matches\", pattern.finditer(sequence))\n", + "print(\"split at matches\", pattern.split(sequence))" ] }, { @@ -377,13 +397,14 @@ "# !{sys.executable} -m pip install numpy\n", "\n", "import numpy as np\n", + "\n", "arr = np.array([1, 2, 3, 4])\n", "print(arr)\n", - "print('array data type:', type(arr))\n", + "print(\"array data type:\", type(arr))\n", "\n", - "print('dtype of array entries:', arr.dtype)\n", + "print(\"dtype of array entries:\", arr.dtype)\n", "\n", - "print('shape of the array:', arr.shape)\n", + "print(\"shape of the array:\", arr.shape)\n", "\n", "# arrays have predefined data types, but those can be converted\n", "arr.astype(float)" @@ -397,11 +418,11 @@ "outputs": [], "source": [ "# lots of common statistical subfunctions\n", - "print('max ', arr.max())\n", - "print('min ', arr.min())\n", - "print('sum ', arr.sum())\n", - "print('mean', arr.mean())\n", - "print('SD ', arr.std())" + "print(\"max \", arr.max())\n", + "print(\"min \", arr.min())\n", + "print(\"sum \", arr.sum())\n", + "print(\"mean\", arr.mean())\n", + "print(\"SD \", arr.std())" ] }, { @@ -412,11 +433,11 @@ "outputs": [], "source": [ "# inversely, you can use the functions from the numpy library and hand over the array\n", - "print('max ', np.max(arr))\n", - "print('min ', np.min(arr))\n", - "print('sum ', np.sum(arr))\n", - "print('mean', np.mean(arr))\n", - "print('SD ', np.std(arr))" + "print(\"max \", np.max(arr))\n", + "print(\"min \", np.min(arr))\n", + "print(\"sum \", np.sum(arr))\n", + "print(\"mean\", np.mean(arr))\n", + "print(\"SD \", np.std(arr))" ] }, { @@ -442,10 +463,10 @@ "outputs": [], "source": [ "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])\n", - "print('array ', arr)\n", - "print('inversion ', arr[::-1])\n", - "print('slice ', arr[3:6])\n", - "print('element selection', arr[::3])" + "print(\"array \", arr)\n", + "print(\"inversion \", arr[::-1])\n", + "print(\"slice \", arr[3:6])\n", + "print(\"element selection\", arr[::3])" ] }, { @@ -456,8 +477,8 @@ "outputs": [], "source": [ "# conditional selection\n", - "print('conditional', arr[arr > 5])\n", - "print('index list ', np.where(arr > 5))" + "print(\"conditional\", arr[arr > 5])\n", + "print(\"index list \", np.where(arr > 5))" ] }, { @@ -493,11 +514,11 @@ "outputs": [], "source": [ "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])\n", - "print('arr ', arr)\n", + "print(\"arr \", arr)\n", "arr2 = arr\n", "arr2[1] = 0\n", - "print('arr2', arr2)\n", - "print('arr ', arr)" + "print(\"arr2\", arr2)\n", + "print(\"arr \", arr)" ] }, { @@ -508,20 +529,20 @@ "outputs": [], "source": [ "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])\n", - "print('arr ', arr)\n", + "print(\"arr \", arr)\n", "arr2 = arr[::2]\n", "arr2[:] = 50\n", - "print('arr2', arr2)\n", - "print('arr ', arr)\n", + "print(\"arr2\", arr2)\n", + "print(\"arr \", arr)\n", "\n", "# vs a copy of the array\n", "\n", "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])\n", - "print('arr ', arr)\n", + "print(\"arr \", arr)\n", "arr2 = np.copy(arr[::2])\n", "arr2[:] = 50\n", - "print('arr2 (copy)', arr2)\n", - "print('arr ', arr)" + "print(\"arr2 (copy)\", arr2)\n", + "print(\"arr \", arr)" ] }, { @@ -544,8 +565,8 @@ "source": [ "arr = np.zeros(shape=(3, 5, 2))\n", "print(arr)\n", - "print('---')\n", - "arr_new = arr.reshape(5, 6) # creates a copy\n", + "print(\"---\")\n", + "arr_new = arr.reshape(5, 6) # creates a copy\n", "print(arr_new)" ] }, @@ -559,10 +580,10 @@ "# index based access to multidimensional arrays:\n", "arr = np.zeros(shape=(2, 3, 2))\n", "\n", - "arr[1][2] = 10 # Python list method\n", - "print(arr, '\\n')\n", + "arr[1][2] = 10 # Python list method\n", + "print(arr, \"\\n\")\n", "\n", - "arr[1, 2] *= 2 # numpy method\n", + "arr[1, 2] *= 2 # numpy method\n", "print(arr)" ] }, @@ -588,11 +609,14 @@ "x = np.random.random(10000)\n", "\n", "# Sum with Python's for loop\n", + "\n", + "\n", "def python_sum(x):\n", " x_sum = 0.0\n", " for y in x:\n", " x_sum += y\n", - " return(x_sum)\n", + " return x_sum\n", + "\n", "\n", "# Test speed\n", "print(\"For loop:\")\n", @@ -602,11 +626,12 @@ "%timeit sum(x)\n", "\n", "import math\n", + "\n", "print(\"\\nMath sum function:\")\n", "%timeit math.fsum(x)\n", "\n", "print(\"\\nNumpy:\")\n", - "%timeit np.sum(x)\n" + "%timeit np.sum(x)" ] }, { @@ -638,7 +663,7 @@ "outputs": [], "source": [ "# showing first 10 lines of the file, missing data indicated by *\n", - "print('\\n'.join(open('data/gfmt_sleep.csv').read().split('\\n')[:10]))" + "print(\"\\n\".join(open(\"data/gfmt_sleep.csv\").read().split(\"\\n\")[:10]))" ] }, { @@ -653,7 +678,7 @@ "\n", "import pandas as pd\n", "\n", - "df = pd.read_csv('../data/gfmt_sleep.csv', na_values='*')\n", + "df = pd.read_csv(\"../data/gfmt_sleep.csv\", na_values=\"*\")\n", "type(df)" ] }, @@ -692,9 +717,9 @@ "metadata": {}, "outputs": [], "source": [ - "df['age']\n", + "df[\"age\"]\n", "print()\n", - "df['age'][0]" + "df[\"age\"][0]" ] }, { @@ -712,7 +737,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.loc[1,'age']" + "df.loc[1, \"age\"]" ] }, { @@ -730,7 +755,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.loc[df['Ppt No.'] == 42, 'overall percent correct']" + "df.loc[df[\"Ppt No.\"] == 42, \"overall percent correct\"]" ] }, { @@ -748,7 +773,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.loc[df['Ppt No.'] == 42, :]" + "df.loc[df[\"Ppt No.\"] == 42, :]" ] }, { @@ -758,7 +783,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.loc[: , 'overall percent correct']" + "df.loc[:, \"overall percent correct\"]" ] }, { @@ -778,7 +803,7 @@ "outputs": [], "source": [ "# getting all females under the age of 21\n", - "df.loc[(df['age'] < 21) & (df['gender'] == 'f'), :]" + "df.loc[(df[\"age\"] < 21) & (df[\"gender\"] == \"f\"), :]" ] }, { @@ -811,7 +836,7 @@ "outputs": [], "source": [ "# re-applying our True/False array\n", - "df.loc[inds, :]#we want to see all columns" + "df.loc[inds, :] # we want to see all columns" ] }, { @@ -836,7 +861,7 @@ "metadata": {}, "outputs": [], "source": [ - "df['insomnia'] = df['sci'] <= 16\n", + "df[\"insomnia\"] = df[\"sci\"] <= 16\n", "df.head()" ] }, @@ -858,11 +883,11 @@ "metadata": {}, "outputs": [], "source": [ - "#calculating the mean of some columns\n", - "mean_correct_pos = np.mean(df.loc[df['insomnia'], 'overall percent correct'])\n", - "mean_correct_neg = np.mean(df.loc[~df['insomnia'], 'overall percent correct'])\n", - "print(F\"Positive cases : {mean_correct_pos:.2f}\")\n", - "print(F\"Negative controls: {mean_correct_neg:.2f}\")" + "# calculating the mean of some columns\n", + "mean_correct_pos = np.mean(df.loc[df[\"insomnia\"], \"overall percent correct\"])\n", + "mean_correct_neg = np.mean(df.loc[~df[\"insomnia\"], \"overall percent correct\"])\n", + "print(f\"Positive cases : {mean_correct_pos:.2f}\")\n", + "print(f\"Negative controls: {mean_correct_neg:.2f}\")" ] }, { @@ -895,7 +920,7 @@ "outputs": [], "source": [ "summary_stats = df.describe()\n", - "summary_stats.loc['50%', :] # this is the median for each column\n", + "summary_stats.loc[\"50%\", :] # this is the median for each column\n", "# you can also write: df.describe().loc['50%',:]" ] }, @@ -916,7 +941,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_csv('data/gfmt_sleep_with_insomnia.csv', index=False)" + "df.to_csv(\"data/gfmt_sleep_with_insomnia.csv\", index=False)" ] }, { @@ -951,7 +976,7 @@ "metadata": {}, "outputs": [], "source": [ - "grouped = df.groupby('insomnia')\n", + "grouped = df.groupby(\"insomnia\")\n", "print(grouped.size())" ] }, @@ -995,7 +1020,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('../data/reeves_gradient_width_various_methods.csv', comment='#', header=[0,1])\n", + "df = pd.read_csv(\n", + " \"../data/reeves_gradient_width_various_methods.csv\", comment=\"#\", header=[0, 1]\n", + ")\n", "\n", "df.head()" ] @@ -1012,8 +1039,8 @@ "# The data give the computed gradient width as parametrized by\n", "# sigma (see the original paper).\n", "\n", - "df.columns.names = ['genotype', 'method']\n", - "df = pd.melt(df, value_name='gradient width')\n", + "df.columns.names = [\"genotype\", \"method\"]\n", + "df = pd.melt(df, value_name=\"gradient width\")\n", "\n", "df.head()" ] @@ -1036,7 +1063,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.groupby(['genotype', 'method']).describe()" + "df.groupby([\"genotype\", \"method\"]).describe()" ] }, { @@ -1147,9 +1174,8 @@ "outputs": [], "source": [ "def phred_to_qual(scores):\n", - " '''converts numerical quality scores to ASCII characters as defined for fastq sequence file formats'''\n", - " return ''.join(chr(q + 33) for q in scores)\n", - "\n" + " \"\"\"converts numerical quality scores to ASCII characters as defined for fastq sequence file formats\"\"\"\n", + " return \"\".join(chr(q + 33) for q in scores)" ] }, { diff --git a/lessons/lesson_08.ipynb b/lessons/lesson_08.ipynb index 593a445..691802b 100644 --- a/lessons/lesson_08.ipynb +++ b/lessons/lesson_08.ipynb @@ -141,12 +141,13 @@ "outputs": [], "source": [ "from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate\n", + "\n", "my_string = \"GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG\"\n", - "print('input sequence ', my_string)\n", - "print('reverse complement ', reverse_complement(my_string))\n", - "print('transcribed seq ', transcribe(my_string))\n", - "print('back-transcribed seq', back_transcribe(transcribe(my_string)))\n", - "print('translated seq ', translate(my_string))" + "print(\"input sequence \", my_string)\n", + "print(\"reverse complement \", reverse_complement(my_string))\n", + "print(\"transcribed seq \", transcribe(my_string))\n", + "print(\"back-transcribed seq\", back_transcribe(transcribe(my_string)))\n", + "print(\"translated seq \", translate(my_string))" ] }, { @@ -173,7 +174,9 @@ "source": [ "from Bio import SeqIO\n", "\n", - "identifiers = [seq_record.id for seq_record in SeqIO.parse(\"../data/ls_orchid.gbk\", \"genbank\")]\n", + "identifiers = [\n", + " seq_record.id for seq_record in SeqIO.parse(\"../data/ls_orchid.gbk\", \"genbank\")\n", + "]\n", "print(identifiers[:5])" ] }, @@ -199,7 +202,8 @@ " break\n", "\n", "import gzip\n", - "with gzip.open(\"../data/ls_orchid.gbk.gz\",'rt') as handle:\n", + "\n", + "with gzip.open(\"../data/ls_orchid.gbk.gz\", \"rt\") as handle:\n", " for seq_record in SeqIO.parse(handle, \"genbank\"):\n", " print(seq_record.id)\n", " break" @@ -233,6 +237,7 @@ "outputs": [], "source": [ "from Bio import SeqIO\n", + "\n", "for record in SeqIO.parse(\"../data/NC_005816.gb\", \"genbank\"):\n", " print(record.id)" ] @@ -244,10 +249,11 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"../data/NC_005816.gb\", 'r') as infile:\n", + "with open(\"../data/NC_005816.gb\", \"r\") as infile:\n", " for e, line in enumerate(infile):\n", - " print(line, end='')\n", - " if e > 10: break" + " print(line, end=\"\")\n", + " if e > 10:\n", + " break" ] }, { @@ -259,10 +265,10 @@ "source": [ "for record in SeqIO.parse(\"../data/NC_005816.gb\", \"genbank\"):\n", " entry = record\n", - " print('Sequence ', record.seq[:50])\n", - " print('Record ID ', record.id)\n", - " print('Description ', record.description)\n", - " print('No. of feat.', len(record.features))\n", + " print(\"Sequence \", record.seq[:50])\n", + " print(\"Record ID \", record.id)\n", + " print(\"Description \", record.description)\n", + " print(\"No. of feat.\", len(record.features))\n", " break" ] }, @@ -288,9 +294,11 @@ "source": [ "for e, feat in enumerate(entry.features):\n", " print(e, feat.type)\n", - " if feat.type != 'source':\n", - " print('feat. location ', feat.location)#circular genome and spanning position 0\n", - " print('feat. information', feat.qualifiers)\n", + " if feat.type != \"source\":\n", + " print(\n", + " \"feat. location \", feat.location\n", + " ) # circular genome and spanning position 0\n", + " print(\"feat. information\", feat.qualifiers)\n", " break" ] }, @@ -329,6 +337,7 @@ "source": [ "from Bio import SeqIO\n", "from Bio import Entrez\n", + "\n", "Entrez.email = \"finstermeier@mpusp.mpg.de\"\n", "handle = Entrez.esearch(db=\"pubmed\", term=\"biopython\")\n", "record = Entrez.read(handle)\n", @@ -353,7 +362,9 @@ "metadata": {}, "outputs": [], "source": [ - "handle = Entrez.esearch(db=\"nucleotide\", term=\"Cypripedioideae[Orgn] AND matK[Gene]\", idtype=\"acc\")\n", + "handle = Entrez.esearch(\n", + " db=\"nucleotide\", term=\"Cypripedioideae[Orgn] AND matK[Gene]\", idtype=\"acc\"\n", + ")\n", "record = Entrez.read(handle)\n", "print(record[\"IdList\"][:5])" ] @@ -373,12 +384,16 @@ "metadata": {}, "outputs": [], "source": [ - "with Entrez.efetch(db=\"nucleotide\", rettype=\"gb\", retmode=\"text\", id=\"6273291\") as handle:\n", - " seq_record = SeqIO.read(handle, \"gb\") #using \"gb\" as an alias for \"genbank\"\n", + "with Entrez.efetch(\n", + " db=\"nucleotide\", rettype=\"gb\", retmode=\"text\", id=\"6273291\"\n", + ") as handle:\n", + " seq_record = SeqIO.read(handle, \"gb\") # using \"gb\" as an alias for \"genbank\"\n", " print(f\"{seq_record.id} with {len(seq_record.features)} features\")\n", - " \n", "\n", - "with Entrez.efetch(db=\"nucleotide\",rettype=\"gb\",retmode=\"text\",id=\"6273291, 6273290, 6273289\") as handle:\n", + "\n", + "with Entrez.efetch(\n", + " db=\"nucleotide\", rettype=\"gb\", retmode=\"text\", id=\"6273291, 6273290, 6273289\"\n", + ") as handle:\n", " for seq_record in SeqIO.parse(handle, \"gb\"):\n", " print(f\"{seq_record.id}: {seq_record.description[:50]}...\")" ] @@ -446,10 +461,33 @@ "outputs": [], "source": [ "xdata = range(20)\n", - "ydata = [0.0, 0.31, 0.59, 0.81, 0.95, 1.0, 0.95, 0.81, 0.59, 0.31, 0.0, -0.31, -0.59, -0.81, -0.95, -1.0, -0.95, -0.81, -0.59, -0.31]\n", + "ydata = [\n", + " 0.0,\n", + " 0.31,\n", + " 0.59,\n", + " 0.81,\n", + " 0.95,\n", + " 1.0,\n", + " 0.95,\n", + " 0.81,\n", + " 0.59,\n", + " 0.31,\n", + " 0.0,\n", + " -0.31,\n", + " -0.59,\n", + " -0.81,\n", + " -0.95,\n", + " -1.0,\n", + " -0.95,\n", + " -0.81,\n", + " -0.59,\n", + " -0.31,\n", + "]\n", "\n", "# adjusting line style, color, and adding markers with a specific size\n", - "plt.plot(xdata, ydata, linewidth=2, linestyle=':', color='green', marker='o', markersize=12)\n", + "plt.plot(\n", + " xdata, ydata, linewidth=2, linestyle=\":\", color=\"green\", marker=\"o\", markersize=12\n", + ")\n", "plt.show()\n", "plt.close()" ] @@ -465,7 +503,9 @@ "xdata2 = range(0, 21, 4)\n", "ydata2 = [3.5, 2.5, 2, 2, 2.5, 3.5]\n", "\n", - "plt.plot(xdata, ydata, linewidth=2, linestyle=':', color='green', marker='o', markersize=12)\n", + "plt.plot(\n", + " xdata, ydata, linewidth=2, linestyle=\":\", color=\"green\", marker=\"o\", markersize=12\n", + ")\n", "plt.plot(xdata2, ydata2, lw=1, color=(1.0, 0, 0))\n", "plt.show()\n", "plt.close()" @@ -480,7 +520,9 @@ "source": [ "# modifying y axis\n", "fig, ax1 = plt.subplots(figsize=(3, 1.69), dpi=200)\n", - "plt.plot(xdata, ydata, linewidth=2, linestyle=':', color='green', marker='o', markersize=4)\n", + "plt.plot(\n", + " xdata, ydata, linewidth=2, linestyle=\":\", color=\"green\", marker=\"o\", markersize=4\n", + ")\n", "plt.plot(xdata2, ydata2, lw=1, color=(1.0, 0, 0))\n", "ax1.set_ylim(-1, 5)\n", "plt.show()\n", @@ -496,12 +538,14 @@ "source": [ "# adding axis lables, minimizing white space outside of the plot\n", "# adding grid lines and adjusting font size for x axis\n", - "fig, ax1 = plt.subplots(figsize=(3,1.69), dpi=200)\n", - "plt.plot(xdata, ydata, linewidth=1, linestyle=':', color='green', marker='o', markersize=4)\n", - "plt.plot(xdata2, ydata2, lw=.5, color=(1.0, 0, 0))\n", - "plt.suptitle('plotting something cool', size='small')\n", - "ax1.set_xlabel('x values')\n", - "ax1.set_ylabel('y axis')\n", + "fig, ax1 = plt.subplots(figsize=(3, 1.69), dpi=200)\n", + "plt.plot(\n", + " xdata, ydata, linewidth=1, linestyle=\":\", color=\"green\", marker=\"o\", markersize=4\n", + ")\n", + "plt.plot(xdata2, ydata2, lw=0.5, color=(1.0, 0, 0))\n", + "plt.suptitle(\"plotting something cool\", size=\"small\")\n", + "ax1.set_xlabel(\"x values\")\n", + "ax1.set_ylabel(\"y axis\")\n", "plt.grid(True)\n", "plt.xticks(fontsize=5)\n", "plt.tight_layout()\n", @@ -524,8 +568,8 @@ "\n", "plt.scatter(xdata, ydata)\n", "\n", - "ax1.set_xlabel('x values')\n", - "ax1.set_ylabel('y axis')\n", + "ax1.set_xlabel(\"x values\")\n", + "ax1.set_ylabel(\"y axis\")\n", "plt.grid(True)\n", "plt.tight_layout()\n", "plt.show()\n", @@ -578,23 +622,35 @@ "import numpy as np\n", "import pandas as pd\n", "from plotnine import (\n", - " ggplot, aes,\n", - " geom_point, geom_col, geom_histogram, geom_smooth,\n", - " facet_wrap, facet_grid,\n", - " labs, theme_minimal, theme, element_text, element_blank,\n", - " scale_color_brewer, scale_fill_brewer,\n", - " coord_flip\n", + " ggplot,\n", + " aes,\n", + " geom_point,\n", + " geom_col,\n", + " geom_histogram,\n", + " geom_smooth,\n", + " facet_wrap,\n", + " facet_grid,\n", + " labs,\n", + " theme_minimal,\n", + " theme,\n", + " element_text,\n", + " element_blank,\n", + " scale_color_brewer,\n", + " scale_fill_brewer,\n", + " coord_flip,\n", ")\n", "\n", "# Reproducible toy dataset\n", "rng = np.random.default_rng(42)\n", "\n", "n = 240\n", - "df = pd.DataFrame({\n", - " \"x\": rng.normal(0, 1, n),\n", - " \"group\": rng.choice([\"A\", \"B\", \"C\"], size=n, replace=True),\n", - " \"category\": rng.choice([\"cat1\", \"cat2\"], size=n, replace=True),\n", - "})\n", + "df = pd.DataFrame(\n", + " {\n", + " \"x\": rng.normal(0, 1, n),\n", + " \"group\": rng.choice([\"A\", \"B\", \"C\"], size=n, replace=True),\n", + " \"category\": rng.choice([\"cat1\", \"cat2\"], size=n, replace=True),\n", + " }\n", + ")\n", "\n", "# Make y depend on x and group to show structure\n", "group_effect = df[\"group\"].map({\"A\": 0.0, \"B\": 1.0, \"C\": -1.0}).astype(float).to_numpy()\n", @@ -627,7 +683,7 @@ " + scale_color_brewer(type=\"qual\", palette=\"Set2\")\n", " + labs(title=\"Scatter: y vs x, colored by group\", x=\"x\", y=\"y\")\n", " + theme_minimal()\n", - ")\n" + ")" ] }, { @@ -675,11 +731,8 @@ "outputs": [], "source": [ "# Summarize mean y by group\n", - "df_mean = (\n", - " df.groupby(\"group\", as_index=False)\n", - " .agg(mean_y=(\"y\", \"mean\"),\n", - " sd_y=(\"y\", \"std\"),\n", - " n=(\"y\", \"size\"))\n", + "df_mean = df.groupby(\"group\", as_index=False).agg(\n", + " mean_y=(\"y\", \"mean\"), sd_y=(\"y\", \"std\"), n=(\"y\", \"size\")\n", ")\n", "df_mean" ] @@ -809,14 +862,14 @@ " + labs(\n", " title=\"Styled plot\",\n", " subtitle=\"Minimal theme + larger title text\",\n", - " caption=\"Synthetic data\"\n", + " caption=\"Synthetic data\",\n", " )\n", " + theme_minimal()\n", " + theme(\n", " plot_title=element_text(size=16, weight=\"bold\"),\n", " plot_subtitle=element_text(size=11),\n", " axis_title=element_text(size=11),\n", - " legend_title=element_blank()\n", + " legend_title=element_blank(),\n", " )\n", ")" ] diff --git a/solutions/solutions_01.ipynb b/solutions/solutions_01.ipynb index cc45bbd..c75e365 100644 --- a/solutions/solutions_01.ipynb +++ b/solutions/solutions_01.ipynb @@ -59,7 +59,7 @@ "source": [ "# fix: data types need to have correct syntax\n", "\n", - "# `True` and `False` are boolean values in Python, they need to be capitalized \n", + "# `True` and `False` are boolean values in Python, they need to be capitalized\n", "\n", "my_bool = True\n", "print(my_bool)" @@ -75,7 +75,7 @@ "\n", "# b is a string, a is an integer, they cannot be added together without converting one of them to the other type\n", "a = 12\n", - "b = '23'\n", + "b = \"23\"\n", "print(a + int(b))\n", "\n", "# alternative: remove quotes to make b an integer\n", @@ -106,7 +106,7 @@ "a = 30\n", "b = 5.576\n", "\n", - "print(a, b)\n" + "print(a, b)" ] }, { @@ -126,8 +126,8 @@ "my_name = \"Albus Dumbledore\"\n", "my_email = \"albus.dumbledore@hogwarts.edu\"\n", "\n", - "print('Full name:', my_name)\n", - "print('Email:', my_email)" + "print(\"Full name:\", my_name)\n", + "print(\"Email:\", my_email)" ] }, { @@ -148,7 +148,7 @@ "radius = 5.0\n", "pi = 3.1415926\n", "area = pi * radius * radius\n", - "print(\"Area of the circle:\", area)\n" + "print(\"Area of the circle:\", area)" ] }, { diff --git a/solutions/solutions_02.ipynb b/solutions/solutions_02.ipynb index cfdc425..4377e51 100644 --- a/solutions/solutions_02.ipynb +++ b/solutions/solutions_02.ipynb @@ -39,7 +39,7 @@ "print(f\"Whole divider: {i // j}\")\n", "\n", "# Remainder\n", - "print(f\"Remainder: {i % j}\")\n" + "print(f\"Remainder: {i % j}\")" ] }, { @@ -77,8 +77,12 @@ "\n", "print(f\"Query sequence '{query_1}' in sequence:\", query_1 in sequence)\n", "\n", - "print(F\"Right side position of {query_1} (expected -10): -{len(sequence) - sequence.rfind(query_1) - len(query_1)}, expectation met: {len(sequence) - sequence.rfind(query_1) - len(query_1) == 10}\")\n", - "print(F\"Right side position of {query_2} (expected -35): -{len(sequence) - sequence.rfind(query_2) - len(query_2)}, expectation met: {len(sequence) - sequence.rfind(query_2) - len(query_2) == 35}\")" + "print(\n", + " f\"Right side position of {query_1} (expected -10): -{len(sequence) - sequence.rfind(query_1) - len(query_1)}, expectation met: {len(sequence) - sequence.rfind(query_1) - len(query_1) == 10}\"\n", + ")\n", + "print(\n", + " f\"Right side position of {query_2} (expected -35): -{len(sequence) - sequence.rfind(query_2) - len(query_2)}, expectation met: {len(sequence) - sequence.rfind(query_2) - len(query_2) == 35}\"\n", + ")" ] }, { @@ -114,8 +118,8 @@ "MMRKLAILSVSSFLFVEALFQEYQCYGSSSNTRVLNELNYDNAGTNLYNELEMNYYGKQENWYSLKKNSRSLGENDDGNNEDNEKLRKPKHKKLKQPADGNPDPNANPNVDPNANPNVDPNANPNVDPNANPNANPNANPNANPNANPNANPNANPNANPNANPNANPNANPNANPNANPNANPNANPNANPNAN\n", "\"\"\"\n", "\n", - "header = fasta.split('\\n')[1]\n", - "sequence = fasta.split('\\n')[2]\n", + "header = fasta.split(\"\\n\")[1]\n", + "sequence = fasta.split(\"\\n\")[2]\n", "\n", "print(f\"ID: {header.split()[0][1:]}\")\n", "print(f\"Sequence length: {len(sequence)}\")\n", diff --git a/solutions/solutions_03.ipynb b/solutions/solutions_03.ipynb index de9e14d..b05b387 100644 --- a/solutions/solutions_03.ipynb +++ b/solutions/solutions_03.ipynb @@ -51,8 +51,8 @@ "gc_content_seq1 = (seq1.count(\"G\") + seq1.count(\"C\")) / len(seq1)\n", "gc_content_seq2 = (seq2.count(\"G\") + seq2.count(\"C\")) / len(seq2)\n", "\n", - "print('Seq1 greater GC content than Seq2:', gc_content_seq1 > gc_content_seq2)\n", - "print('Seq1 GC content:', gc_content_seq1, 'Seq2 GC content:', gc_content_seq2)" + "print(\"Seq1 greater GC content than Seq2:\", gc_content_seq1 > gc_content_seq2)\n", + "print(\"Seq1 GC content:\", gc_content_seq1, \"Seq2 GC content:\", gc_content_seq2)" ] }, { @@ -103,9 +103,11 @@ "seq1 = \"GGCTATGCCGCCGTTATACTCGAGACTAAGTAGTC\"\n", "seq2 = \"GGCTATGCCGCCGTTATATCGAGACTAAAGTAGTC\"\n", "\n", + "\n", "def contains_start_codon(seq):\n", " return \"ATG\" in seq # or seq.count(\"ATG\"), or seq.find(\"ATG\") ...\n", "\n", + "\n", "print(contains_start_codon(seq1))\n", "print(contains_start_codon(seq2))" ] diff --git a/solutions/solutions_04.ipynb b/solutions/solutions_04.ipynb index 3a05b1a..9b70895 100644 --- a/solutions/solutions_04.ipynb +++ b/solutions/solutions_04.ipynb @@ -30,7 +30,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_sports = ['golf', 'tennis', 'football', 'rugby', 'volleyball']\n", + "my_sports = [\"golf\", \"tennis\", \"football\", \"rugby\", \"volleyball\"]\n", "\n", "print(f\"Number of sports: {len(my_sports)}\")\n", "for sport in my_sports:\n", @@ -54,11 +54,11 @@ "outputs": [], "source": [ "for e, sport in enumerate(my_sports, start=1):\n", - " if e%2 == 0:\n", + " if e % 2 == 0:\n", " print(f\"Every 2nd element: I like to play {sport[0].upper()}{sport[1:]}.\")\n", "\n", "for e, sport in enumerate(my_sports):\n", - " if e%3 == 0:\n", + " if e % 3 == 0:\n", " print(f\"Every 3rd element: I like to play {sport[0].upper()}{sport[1:]}.\")" ] }, @@ -97,7 +97,7 @@ "outputs": [], "source": [ "my_sports.pop(0)\n", - "my_sports.append('basketball')\n", + "my_sports.append(\"basketball\")\n", "\n", "print(my_sports)" ] @@ -119,11 +119,11 @@ "source": [ "second_list = []\n", "second_list = list(my_sports)\n", - "second_list[0] = 'baseball'\n", - "second_list[-1] = 'handball'\n", + "second_list[0] = \"baseball\"\n", + "second_list[-1] = \"handball\"\n", "\n", - "print('first list:', my_sports)\n", - "print('second list:', second_list)" + "print(\"first list:\", my_sports)\n", + "print(\"second list:\", second_list)" ] }, { @@ -178,7 +178,7 @@ "outputs": [], "source": [ "second_list.pop(2)\n", - "print(F\"Length of second list: {len(second_list)}\")" + "print(f\"Length of second list: {len(second_list)}\")" ] }, { @@ -263,11 +263,11 @@ "# this will just result in a string: my_name = ('Luke Skywalker')\n", "# this will break up the single string: my_name = tuple('Luke Skywalker')\n", "\n", - "my_name = ('Luke Skywalker', )\n", + "my_name = (\"Luke Skywalker\",)\n", "\n", "# OR\n", "\n", - "my_name = tuple(['Luke Skywalker'])\n", + "my_name = tuple([\"Luke Skywalker\"])\n", "\n", "print(my_name, type(my_name))" ] @@ -355,8 +355,8 @@ "source": [ "seq = \"ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGTAA\"\n", "\n", - "print(F\"Length of nucleotide sequence: {len(seq)} nt.\")\n", - "print(F\"Length of AA sequence: {len(seq) // 3} aa.\")" + "print(f\"Length of nucleotide sequence: {len(seq)} nt.\")\n", + "print(f\"Length of AA sequence: {len(seq) // 3} aa.\")" ] }, { @@ -400,7 +400,7 @@ "outputs": [], "source": [ "for e, nt in enumerate(seq):\n", - " if seq[e:e+3] == \"ATG\":\n", + " if seq[e : e + 3] == \"ATG\":\n", " print(f\"ATG found at position {e}.\")" ] }, @@ -435,7 +435,7 @@ "count_K = 0\n", "count_SL = 0\n", "count_C = 0\n", - "for elem in string.split(','):\n", + "for elem in string.split(\",\"):\n", " if \"K\" in elem:\n", " count_K += 1\n", " if \"S\" in elem and not \"L\" in elem:\n", @@ -469,7 +469,7 @@ "outputs": [], "source": [ "count = 1\n", - "while count <= 7*7:\n", + "while count <= 7 * 7:\n", " if count % 7 == 0:\n", " print(count)\n", " count += 1" diff --git a/solutions/solutions_06.ipynb b/solutions/solutions_06.ipynb index a61421c..75afffc 100644 --- a/solutions/solutions_06.ipynb +++ b/solutions/solutions_06.ipynb @@ -26,20 +26,21 @@ "outputs": [], "source": [ "def my_calculator(num1, num2, operation):\n", - " '''calculator performing the basic 4 math operations. Operator provided as string, e.g. 'add' or '+'.'''\n", - " if operation in ['add', '+']:\n", + " \"\"\"calculator performing the basic 4 math operations. Operator provided as string, e.g. 'add' or '+'.\"\"\"\n", + " if operation in [\"add\", \"+\"]:\n", " return num1 + num2\n", - " elif operation in ['subtract', '-']:\n", + " elif operation in [\"subtract\", \"-\"]:\n", " return num1 - num2\n", - " elif operation in ['multiply', '*']:\n", + " elif operation in [\"multiply\", \"*\"]:\n", " return num1 * num2\n", - " elif operation in ['divide', '/']:\n", + " elif operation in [\"divide\", \"/\"]:\n", " if num2 == 0:\n", - " return 'Error: division by zero is not allowed.'\n", + " return \"Error: division by zero is not allowed.\"\n", " else:\n", " return num1 / num2\n", "\n", - "print( my_calculator(10, 5, 'add') )" + "\n", + "print(my_calculator(10, 5, \"add\"))" ] }, { @@ -59,21 +60,30 @@ "outputs": [], "source": [ "def test_calculator():\n", - " operations = ['add', '-', 'multiply', '/']\n", + " operations = [\"add\", \"-\", \"multiply\", \"/\"]\n", " num1 = 24\n", " num2 = 6\n", " results = [30, 18, 144, 4]\n", " for e, op in enumerate(operations):\n", " if my_calculator(num1, num2, op) == results[e]:\n", - " print(f\"Test for operation '{op}' passed. {num1} {op} {num2} = {results[e]}\")\n", + " print(\n", + " f\"Test for operation '{op}' passed. {num1} {op} {num2} = {results[e]}\"\n", + " )\n", " else:\n", - " print(f\"Test for operation '{op}' failed. {num1} {op} {num2} should be {results[e]} but got {my_calculator(num1, num2, op)} instead.\")\n", - " \n", + " print(\n", + " f\"Test for operation '{op}' failed. {num1} {op} {num2} should be {results[e]} but got {my_calculator(num1, num2, op)} instead.\"\n", + " )\n", + "\n", " num2 = 0\n", - " if my_calculator(num1, num2, 'divide') == 'Error: division by zero is not allowed.':\n", - " print(f\"Test for division by zero passed. {num1} / {num2} should return an error message.\")\n", + " if my_calculator(num1, num2, \"divide\") == \"Error: division by zero is not allowed.\":\n", + " print(\n", + " f\"Test for division by zero passed. {num1} / {num2} should return an error message.\"\n", + " )\n", " else:\n", - " print(f\"Test for division by zero failed. {num1} / {num2} should return an error message but got {my_calculator(num1, num2, 'divide')} instead.\")\n", + " print(\n", + " f\"Test for division by zero failed. {num1} / {num2} should return an error message but got {my_calculator(num1, num2, 'divide')} instead.\"\n", + " )\n", + "\n", "\n", "test_calculator()" ] @@ -97,10 +107,20 @@ "source": [ "sequence = \"ACGTTCGATatgcgtcagcACTGT\"\n", "\n", - "lookup = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}\n", + "lookup = {\n", + " \"A\": \"T\",\n", + " \"C\": \"G\",\n", + " \"G\": \"C\",\n", + " \"T\": \"A\",\n", + " \"a\": \"t\",\n", + " \"c\": \"g\",\n", + " \"g\": \"c\",\n", + " \"t\": \"a\",\n", + "}\n", + "\n", "\n", "def reverse_complement(seq):\n", - " rc = ''\n", + " rc = \"\"\n", " for nt in seq:\n", " if nt in lookup:\n", " rc += lookup[nt]\n", @@ -109,6 +129,7 @@ " rc = rc[::-1]\n", " return rc\n", "\n", + "\n", "reverse_complement(sequence)" ] }, @@ -130,8 +151,9 @@ "source": [ "seq = \"ATTGCTATCGTTCATGTACGAGACAACCATGCCTACGATTGAGACGAGCGTTGAAGGAAACGAAAGTTAACAGAGCTTCCCGTAAACCGTATCCTCGCCC\"\n", "\n", - "start_codon = 'ATG'\n", - "end_codon = 'TAA'\n", + "start_codon = \"ATG\"\n", + "end_codon = \"TAA\"\n", + "\n", "\n", "def find_orfs(seq):\n", " coordinates = []\n", @@ -145,10 +167,11 @@ " end_index += 1\n", " start_index += 1\n", " # make sure that the ORF is a multiple of 3 nt long / in-frame\n", - " coordinates = [(start, end) for start, end in coordinates if (end-start) % 3 == 0]\n", - " seqs = [seq[start:end+3] for start, end in coordinates]\n", + " coordinates = [(start, end) for start, end in coordinates if (end - start) % 3 == 0]\n", + " seqs = [seq[start : end + 3] for start, end in coordinates]\n", " return seqs\n", "\n", + "\n", "print(find_orfs(seq))" ] }, @@ -170,19 +193,21 @@ "metadata": {}, "outputs": [], "source": [ - "def write_orfs_to_file(seqs, outname='../data/orfs.txt'):\n", - " with open(outname, 'w') as outfile:\n", + "def write_orfs_to_file(seqs, outname=\"../data/orfs.txt\"):\n", + " with open(outname, \"w\") as outfile:\n", " for e, orf in enumerate(seqs):\n", - " outfile.write(F\">orf{e}\\n{orf}\\n\")\n", - " \n", - "def read_orfs_from_file(inname='../data/orfs.txt'):\n", - " with open(inname, 'r') as infile:\n", + " outfile.write(f\">orf{e}\\n{orf}\\n\")\n", + "\n", + "\n", + "def read_orfs_from_file(inname=\"../data/orfs.txt\"):\n", + " with open(inname, \"r\") as infile:\n", " for line in infile:\n", " print(line.strip())\n", - " \n", + "\n", + "\n", "seqs = find_orfs(seq)\n", - "write_orfs_to_file(seqs, outname='../data/orfs.txt')\n", - "read_orfs_from_file(inname='../data/orfs.txt')" + "write_orfs_to_file(seqs, outname=\"../data/orfs.txt\")\n", + "read_orfs_from_file(inname=\"../data/orfs.txt\")" ] } ], diff --git a/solutions/solutions_07.ipynb b/solutions/solutions_07.ipynb index 4f3c9d5..db3d1e5 100644 --- a/solutions/solutions_07.ipynb +++ b/solutions/solutions_07.ipynb @@ -32,7 +32,7 @@ "import random\n", "\n", "length = 1000\n", - "random_sequence = ''.join(random.choices(list('ACGU'), k=length))" + "random_sequence = \"\".join(random.choices(list(\"ACGU\"), k=length))" ] }, { @@ -54,13 +54,13 @@ "source": [ "import re\n", "\n", - "pattern = re.compile('UAA|UAG|UGA')\n", + "pattern = re.compile(\"UAA|UAG|UGA\")\n", "\n", "positions = []\n", "\n", "for p in re.finditer(pattern, random_sequence):\n", " positions.append(p.span())\n", - " \n", + "\n", "print(positions)" ] }, @@ -82,11 +82,11 @@ "source": [ "length = len(random_sequence)\n", "steps = length // 10\n", - "seqs = [random_sequence[i:i+150] for i in range(0, length, steps)]\n", + "seqs = [random_sequence[i : i + 150] for i in range(0, length, steps)]\n", "\n", "print(f\"Whole seq: {random_sequence[:10]} ... {random_sequence[-10:]}\")\n", "print(f\"Seq segments: {seqs[0][:10]} ... {seqs[-1][-10:]}\")\n", - "print(f\"Step size: {steps}\")\n" + "print(f\"Step size: {steps}\")" ] }, { @@ -142,18 +142,20 @@ "source": [ "import gzip\n", "\n", + "\n", "def phred_to_qual(scores):\n", - " '''converts numerical quality scores to ASCII characters as defined for fastq sequence file formats'''\n", - " return ''.join(chr(q + 33) for q in scores)\n", + " \"\"\"converts numerical quality scores to ASCII characters as defined for fastq sequence file formats\"\"\"\n", + " return \"\".join(chr(q + 33) for q in scores)\n", + "\n", "\n", "output = []\n", "for seq, scores in quality_scores:\n", " qual_string = phred_to_qual(scores)\n", " output.append((seq, qual_string))\n", "\n", - "with gzip.open('../data/random_sequences.fastq.gz', 'wt') as outfile:\n", + "with gzip.open(\"../data/random_sequences.fastq.gz\", \"wt\") as outfile:\n", " for i in range(10):\n", - " outfile.write(f\"@ID_{i+1}\\n{output[i][0]}\\n+\\n{output[i][1]}\\n\")" + " outfile.write(f\"@ID_{i+1}\\n{output[i][0]}\\n+\\n{output[i][1]}\\n\")" ] }, { @@ -181,6 +183,7 @@ "outputs": [], "source": [ "import numpy as np\n", + "\n", "arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])" ] }, @@ -219,8 +222,8 @@ " arr[i, j] = random.randint(1, 10)\n", "\n", "print(arr)\n", - "print(np.mean(arr[:,2]))\n", - "print(np.median(arr[2,:]))\n", + "print(np.mean(arr[:, 2]))\n", + "print(np.median(arr[2, :]))\n", "print(arr[1:4, 1:4])" ] }, @@ -240,13 +243,13 @@ "metadata": {}, "outputs": [], "source": [ - "median = np.median(arr[2,:])\n", + "median = np.median(arr[2, :])\n", "print(f\"Median: {median}\")\n", "arr_mask = arr < median\n", "print(arr_mask)\n", "\n", "print(\"\\nsquare of unmasked values:\")\n", - "arr_sq = np.where(arr_mask, arr ** 2, arr)\n", + "arr_sq = np.where(arr_mask, arr**2, arr)\n", "print(arr_sq)" ] }, @@ -271,10 +274,11 @@ "outputs": [], "source": [ "import pandas as pd\n", - "df = pd.read_csv('../data/iris.data', na_values='*')\n", + "\n", + "df = pd.read_csv(\"../data/iris.data\", na_values=\"*\")\n", "print(df.loc[:4, :])\n", - "print('\\nColumn names:', df.columns)\n", - "print('\\nSummary:\\n', df.describe())" + "print(\"\\nColumn names:\", df.columns)\n", + "print(\"\\nSummary:\\n\", df.describe())" ] }, { @@ -292,7 +296,7 @@ "metadata": {}, "outputs": [], "source": [ - "subset = df.loc[(df['septal_length'] >= 5.0) & (df['petal_width'] <= 1.5), :]\n", + "subset = df.loc[(df[\"septal_length\"] >= 5.0) & (df[\"petal_width\"] <= 1.5), :]\n", "print(subset)" ] }, @@ -311,7 +315,7 @@ "metadata": {}, "outputs": [], "source": [ - "subset['combined'] = subset['septal_length'] * subset['petal_width']\n", + "subset[\"combined\"] = subset[\"septal_length\"] * subset[\"petal_width\"]\n", "print(subset.head())" ] }, @@ -330,7 +334,7 @@ "metadata": {}, "outputs": [], "source": [ - "subset.groupby('species')['septal_width'].mean()" + "subset.groupby(\"species\")[\"septal_width\"].mean()" ] }, { @@ -350,7 +354,7 @@ "metadata": {}, "outputs": [], "source": [ - "pd.melt(subset, id_vars=['species'], var_name='measurement', value_name='value')" + "pd.melt(subset, id_vars=[\"species\"], var_name=\"measurement\", value_name=\"value\")" ] } ], From 4af3feaa0222c16578a62f8e46a6181ed21ef248 Mon Sep 17 00:00:00 2001 From: m-jahn Date: Thu, 23 Apr 2026 09:52:07 +0200 Subject: [PATCH 3/5] feat: set up pixi for version and task management --- .gitattributes | 2 + .gitignore | 3 + pixi.lock | 639 +++++++++++++++++++++++++++++++++++++++++++++++++ pixi.toml | 14 ++ 4 files changed, 658 insertions(+) create mode 100644 .gitattributes create mode 100644 pixi.lock create mode 100644 pixi.toml diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..997504b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# SCM syntax highlighting & preventing 3-way merges +pixi.lock merge=binary linguist-language=YAML linguist-generated=true -diff diff --git a/.gitignore b/.gitignore index ca05c86..fcc3e19 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ exercises/* templates/*.html templates/*_files +# pixi environments +.pixi/* +!.pixi/config.toml diff --git a/pixi.lock b/pixi.lock new file mode 100644 index 0000000..899310d --- /dev/null +++ b/pixi.lock @@ -0,0 +1,639 @@ +version: 6 +environments: + default: + channels: + - url: https://conda.anaconda.org/conda-forge/ + - url: https://conda.anaconda.org/bioconda/ + options: + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/autopep8-2.3.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/black-26.3.1-pyh866005b_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.2-pyhc90fa1f_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/decorator-5.2.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/executing-2.2.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ipython-9.12.0-pyhecfbec7_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ipython_pygments_lexers-1.1.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jedi-0.19.2-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.5-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.0-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.2.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nbqa-1.9.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.1-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.6-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-1.0.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.6-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.52-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pure_eval-0.2.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycodestyle-2.14.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.13-h6add32d_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pytokens-0.4.1-py313h54dd161_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/stack_data-0.6.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tokenize-rt-6.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.6.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + build_number: 20 + sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 + md5: a9f577daf3de00bca7c3c76c0ecbd1de + depends: + - __glibc >=2.17,<3.0.a0 + - libgomp >=7.5.0 + constrains: + - openmp_impl <0.0a0 + license: BSD-3-Clause + license_family: BSD + size: 28948 + timestamp: 1770939786096 +- conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda + sha256: ee4da0f3fe9d59439798ee399ef3e482791e48784873d546e706d0935f9ff010 + md5: 9673a61a297b00016442e022d689faa6 + depends: + - python >=3.10 + constrains: + - astroid >=2,<5 + license: Apache-2.0 + license_family: Apache + size: 28797 + timestamp: 1763410017955 +- conda: https://conda.anaconda.org/conda-forge/noarch/autopep8-2.3.2-pyhd8ed1ab_0.conda + sha256: 1dc8ba2892c76c7bdd6518e3684b88710f4a985ebfc1d4f588478569391d300b + md5: 08ee18d78273baa3ed4cef5a8a58d79a + depends: + - packaging + - pycodestyle >=2.12.0 + - python >=3.9 + - tomli + license: MIT + license_family: MIT + size: 46233 + timestamp: 1736871757804 +- conda: https://conda.anaconda.org/conda-forge/noarch/black-26.3.1-pyh866005b_0.conda + sha256: 671b78df3fd288e4c99762d9a1b0391b70be2c7a46df564d6e6b3862db2ec799 + md5: c7e43448266209d766a229cada982884 + depends: + - click >=8.0.0 + - mypy_extensions >=0.4.3 + - packaging >=22.0 + - pathspec >=0.9 + - platformdirs >=2 + - python >=3.11 + - pytokens >=0.4 + license: MIT + license_family: MIT + size: 171751 + timestamp: 1773315364851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 + md5: d2ffd7602c02f2b316fd921d39876885 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: bzip2-1.0.6 + license_family: BSD + size: 260182 + timestamp: 1771350215188 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + sha256: c9dbcc8039a52023660d6d1bbf87594a93dd69c6ac5a2a44323af2c92976728d + md5: e18ad67cf881dcadee8b8d9e2f8e5f73 + depends: + - __unix + license: ISC + size: 131039 + timestamp: 1776865545798 +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.2-pyhc90fa1f_0.conda + sha256: 526d434cf5390310f40f34ea6ec4f0c225cdf1e419010e624d399b13b2059f0f + md5: 4d18bc3af7cfcea97bd817164672a08c + depends: + - __unix + - python + - python >=3.10 + license: BSD-3-Clause + license_family: BSD + size: 98253 + timestamp: 1775578217828 +- conda: https://conda.anaconda.org/conda-forge/noarch/decorator-5.2.1-pyhd8ed1ab_0.conda + sha256: c17c6b9937c08ad63cb20a26f403a3234088e57d4455600974a0ce865cb14017 + md5: 9ce473d1d1be1cc3810856a48b3fab32 + depends: + - python >=3.9 + license: BSD-2-Clause + license_family: BSD + size: 14129 + timestamp: 1740385067843 +- conda: https://conda.anaconda.org/conda-forge/noarch/executing-2.2.1-pyhd8ed1ab_0.conda + sha256: 210c8165a58fdbf16e626aac93cc4c14dbd551a01d1516be5ecad795d2422cad + md5: ff9efb7f7469aed3c4a8106ffa29593c + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 30753 + timestamp: 1756729456476 +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + sha256: fbf86c4a59c2ed05bbffb2ba25c7ed94f6185ec30ecb691615d42342baa1a16a + md5: c80d8a3b84358cb967fa81e7075fbc8a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + size: 12723451 + timestamp: 1773822285671 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda + sha256: 82ab2a0d91ca1e7e63ab6a4939356667ef683905dea631bc2121aa534d347b16 + md5: 080594bf4493e6bae2607e65390c520a + depends: + - python >=3.10 + - zipp >=3.20 + - python + license: Apache-2.0 + license_family: APACHE + size: 34387 + timestamp: 1773931568510 +- conda: https://conda.anaconda.org/conda-forge/noarch/ipython-9.12.0-pyhecfbec7_0.conda + sha256: 932044bd893f7adce6c9b384b96a72fd3804cc381e76789398c2fae900f21df7 + md5: b293210beb192c3024683bf6a998a0b8 + depends: + - __unix + - decorator >=5.1.0 + - ipython_pygments_lexers >=1.0.0 + - jedi >=0.18.2 + - matplotlib-inline >=0.1.6 + - prompt-toolkit >=3.0.41,<3.1.0 + - pygments >=2.14.0 + - python >=3.12 + - stack_data >=0.6.0 + - traitlets >=5.13.0 + - pexpect >4.6 + - python + license: BSD-3-Clause + license_family: BSD + size: 649967 + timestamp: 1774609994657 +- conda: https://conda.anaconda.org/conda-forge/noarch/ipython_pygments_lexers-1.1.1-pyhd8ed1ab_0.conda + sha256: 894682a42a7d659ae12878dbcb274516a7031bbea9104e92f8e88c1f2765a104 + md5: bd80ba060603cc228d9d81c257093119 + depends: + - pygments + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + size: 13993 + timestamp: 1737123723464 +- conda: https://conda.anaconda.org/conda-forge/noarch/jedi-0.19.2-pyhd8ed1ab_1.conda + sha256: 92c4d217e2dc68983f724aa983cca5464dcb929c566627b26a2511159667dba8 + md5: a4f4c5dc9b80bc50e0d3dc4e6e8f1bd9 + depends: + - parso >=0.8.3,<0.9.0 + - python >=3.9 + license: Apache-2.0 AND MIT + size: 843646 + timestamp: 1733300981994 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + sha256: 3d584956604909ff5df353767f3a2a2f60e07d070b328d109f30ac40cd62df6c + md5: 18335a698559cdbcd86150a48bf54ba6 + depends: + - __glibc >=2.17,<3.0.a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - binutils_impl_linux-64 2.45.1 + license: GPL-3.0-only + license_family: GPL + size: 728002 + timestamp: 1774197446916 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.5-hecca717_0.conda + sha256: e8c2b57f6aacabdf2f1b0924bd4831ce5071ba080baa4a9e8c0d720588b6794c + md5: 49f570f3bc4c874a06ea69b7225753af + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.7.5.* + license: MIT + license_family: MIT + size: 76624 + timestamp: 1774719175983 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 + md5: a360c33a5abe61c07959e449fa1453eb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 58592 + timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 + md5: 0aa00f03f9e39fb9876085dee11a85d4 + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_18 + - libgomp 15.2.0 he0feb66_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 1041788 + timestamp: 1771378212382 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 + md5: 239c5e9546c38a1e884d69effcf4c882 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 603262 + timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + sha256: ec30e52a3c1bf7d0425380a189d209a52baa03f22fb66dd3eb587acaa765bd6d + md5: b88d90cad08e6bc8ad540cb310a761fb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - xz 5.8.3.* + license: 0BSD + size: 113478 + timestamp: 1775825492909 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 + md5: 2c21e66f50753a083cbe6b80f38268fa + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + size: 92400 + timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.0-hf4e2dac_0.conda + sha256: ec37c79f737933bbac965f5dc0f08ef2790247129a84bb3114fad4900adce401 + md5: 810d83373448da85c3f673fbcb7ad3a3 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + license: blessing + size: 958864 + timestamp: 1775753750179 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e + md5: 1b08cd684f34175e4514474793d44bcb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc 15.2.0 he0feb66_18 + constrains: + - libstdcxx-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 5852330 + timestamp: 1771378262446 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + sha256: bc1b08c92626c91500fd9f26f2c797f3eb153b627d53e9c13cd167f1e12b2829 + md5: 38ffe67b78c9d4de527be8315e5ada2c + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 40297 + timestamp: 1775052476770 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + sha256: 55044c403570f0dc26e6364de4dc5368e5f3fc7ff103e867c487e2b5ab2bcda9 + md5: d87ff7921124eccd67248aa483c23fec + depends: + - __glibc >=2.17,<3.0.a0 + constrains: + - zlib 1.3.2 *_2 + license: Zlib + license_family: Other + size: 63629 + timestamp: 1774072609062 +- conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.2.1-pyhd8ed1ab_0.conda + sha256: 9d690334de0cd1d22c51bc28420663f4277cfa60d34fa5cad1ce284a13f1d603 + md5: 00e120ce3e40bad7bfc78861ce3c4a25 + depends: + - python >=3.10 + - traitlets + license: BSD-3-Clause + license_family: BSD + size: 15175 + timestamp: 1761214578417 +- conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda + sha256: 6ed158e4e5dd8f6a10ad9e525631e35cee8557718f83de7a4e3966b1f772c4b1 + md5: e9c622e0d00fa24a6292279af3ab6d06 + depends: + - python >=3.9 + license: MIT + license_family: MIT + size: 11766 + timestamp: 1745776666688 +- conda: https://conda.anaconda.org/conda-forge/noarch/nbqa-1.9.0-pyhd8ed1ab_0.conda + sha256: cf2323ebaf70dd55bda9292b008975c134896091c550d881eca4cb5669b09afd + md5: 44f74c1a5386ea4d95a0f34314f68517 + depends: + - autopep8 >=1.5 + - importlib-metadata >=1.4 + - ipython >=7.8.0 + - python >=3.8.0 + - tokenize-rt >=3.2.0 + - tomli + license: MIT + license_family: MIT + size: 34693 + timestamp: 1724794010943 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 + md5: 47e340acb35de30501a76c7c799c41d7 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: X11 AND BSD-3-Clause + size: 891641 + timestamp: 1738195959188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + sha256: c0ef482280e38c71a08ad6d71448194b719630345b0c9c60744a2010e8a8e0cb + md5: da1b85b6a87e141f5140bb9924cecab0 + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + size: 3167099 + timestamp: 1775587756857 +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.1-pyhc364b38_0.conda + sha256: 171d977bc977fd80f2a05de3d4b7d571c4ec3cdea436ed364e5cd50547c50881 + md5: b8ae38639d323d808da535fb71e31be8 + depends: + - python >=3.8 + - python + license: Apache-2.0 + license_family: APACHE + size: 89360 + timestamp: 1776209387231 +- conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.6-pyhcf101f3_0.conda + sha256: 42b2d77ccea60752f3aa929a6413a7835aaacdbbde679f2f5870a744fa836b94 + md5: 97c1ce2fffa1209e7afb432810ec6e12 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 82287 + timestamp: 1770676243987 +- conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-1.0.4-pyhd8ed1ab_0.conda + sha256: 29ea20d0faf20374fcd61c25f6d32fb8e9a2c786a7f1473a0c3ead359470fbe1 + md5: 2908273ac396d2cd210a8127f5f1c0d6 + depends: + - python >=3.10 + license: MPL-2.0 + license_family: MOZILLA + size: 53739 + timestamp: 1769677743677 +- conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda + sha256: 202af1de83b585d36445dc1fda94266697341994d1a3328fabde4989e1b3d07a + md5: d0d408b1f18883a944376da5cf8101ea + depends: + - ptyprocess >=0.5 + - python >=3.9 + license: ISC + size: 53561 + timestamp: 1733302019362 +- conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.6-pyhcf101f3_0.conda + sha256: 8f29915c172f1f7f4f7c9391cd5dac3ebf5d13745c8b7c8006032615246345a5 + md5: 89c0b6d1793601a2a3a3f7d2d3d8b937 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 25862 + timestamp: 1775741140609 +- conda: https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.52-pyha770c72_0.conda + sha256: 4817651a276016f3838957bfdf963386438c70761e9faec7749d411635979bae + md5: edb16f14d920fb3faf17f5ce582942d6 + depends: + - python >=3.10 + - wcwidth + constrains: + - prompt_toolkit 3.0.52 + license: BSD-3-Clause + license_family: BSD + size: 273927 + timestamp: 1756321848365 +- conda: https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd8ed1ab_1.conda + sha256: a7713dfe30faf17508ec359e0bc7e0983f5d94682492469bd462cdaae9c64d83 + md5: 7d9daffbb8d8e0af0f769dbbcd173a54 + depends: + - python >=3.9 + license: ISC + size: 19457 + timestamp: 1733302371990 +- conda: https://conda.anaconda.org/conda-forge/noarch/pure_eval-0.2.3-pyhd8ed1ab_1.conda + sha256: 71bd24600d14bb171a6321d523486f6a06f855e75e547fa0cb2a0953b02047f0 + md5: 3bfdfb8dbcdc4af1ae3f9a8eb3948f04 + depends: + - python >=3.9 + license: MIT + license_family: MIT + size: 16668 + timestamp: 1733569518868 +- conda: https://conda.anaconda.org/conda-forge/noarch/pycodestyle-2.14.0-pyhd8ed1ab_0.conda + sha256: 1950f71ff44e64163e176b1ca34812afc1a104075c3190de50597e1623eb7d53 + md5: 85815c6a22905c080111ec8d56741454 + depends: + - python >=3.9 + license: MIT + license_family: MIT + size: 35182 + timestamp: 1750616054854 +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda + sha256: cf70b2f5ad9ae472b71235e5c8a736c9316df3705746de419b59d442e8348e86 + md5: 16c18772b340887160c79a6acc022db0 + depends: + - python >=3.10 + license: BSD-2-Clause + license_family: BSD + size: 893031 + timestamp: 1774796815820 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.13-h6add32d_100_cp313.conda + build_number: 100 + sha256: 7f77eb57648f545c1f58e10035d0d9d66b0a0efb7c4b58d3ed89ec7269afdde1 + md5: 05051be49267378d2fcd12931e319ac3 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.7.5,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.52.0,<4.0a0 + - libuuid >=2.42,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.6,<4.0a0 + - python_abi 3.13.* *_cp313 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + license: Python-2.0 + size: 37358322 + timestamp: 1775614712638 + python_site_packages_path: lib/python3.13/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + build_number: 8 + sha256: 210bffe7b121e651419cb196a2a63687b087497595c9be9d20ebe97dd06060a7 + md5: 94305520c52a4aa3f6c2b1ff6008d9f8 + constrains: + - python 3.13.* *_cp313 + license: BSD-3-Clause + license_family: BSD + size: 7002 + timestamp: 1752805902938 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pytokens-0.4.1-py313h54dd161_1.conda + sha256: 543302099bbe6b2e77e8a43894dc3894a0bf47e18ea1b0b21ade196f0bdf1ce7 + md5: 8aafbc11caed472c9f7a174f9925fb94 + depends: + - python + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - python_abi 3.13.* *_cp313 + license: MIT + license_family: MIT + size: 277555 + timestamp: 1771613648731 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 + md5: d7d95fc8287ea7bf33e0e7116d2b95ec + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + size: 345073 + timestamp: 1765813471974 +- conda: https://conda.anaconda.org/conda-forge/noarch/stack_data-0.6.3-pyhd8ed1ab_1.conda + sha256: 570da295d421661af487f1595045760526964f41471021056e993e73089e9c41 + md5: b1b505328da7a6b246787df4b5a49fbc + depends: + - asttokens + - executing + - pure_eval + - python >=3.9 + license: MIT + license_family: MIT + size: 26988 + timestamp: 1733569565672 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac + md5: cffd3bdd58090148f4cfcd831f4b26ab + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + constrains: + - xorg-libx11 >=1.8.12,<2.0a0 + license: TCL + license_family: BSD + size: 3301196 + timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/noarch/tokenize-rt-6.2.0-pyhd8ed1ab_0.conda + sha256: b8da0c728e1313e116a06084ea770c6ad752b9cd086d52b20fcd464bdce52e4b + md5: 0a42378794e0425eb5defc9d63e60607 + depends: + - python >=3.9 + license: MIT + license_family: MIT + size: 12383 + timestamp: 1748092106333 +- conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.1-pyhcf101f3_0.conda + sha256: 91cafdb64268e43e0e10d30bd1bef5af392e69f00edd34dfaf909f69ab2da6bd + md5: b5325cf06a000c5b14970462ff5e4d58 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 21561 + timestamp: 1774492402955 +- conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda + sha256: f39a5620c6e8e9e98357507262a7869de2ae8cc07da8b7f84e517c9fd6c2b959 + md5: 019a7385be9af33791c989871317e1ed + depends: + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + size: 110051 + timestamp: 1733367480074 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c + md5: ad659d0a2b3e47e38d829aa8cad2d610 + license: LicenseRef-Public-Domain + size: 119135 + timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.6.0-pyhd8ed1ab_0.conda + sha256: e298b508b2473c4227206800dfb14c39e4b14fd79d4636132e9e1e4244cdf4aa + md5: c3197f8c0d5b955c904616b716aca093 + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 71550 + timestamp: 1770634638503 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.1-pyhcf101f3_0.conda + sha256: 523616c0530d305d2216c2b4a8dfd3872628b60083255b89c5e0d8c42e738cca + md5: e1c36c6121a7c9c76f2f148f1e83b983 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 24461 + timestamp: 1776131454755 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 + md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 + depends: + - __glibc >=2.17,<3.0.a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + size: 601375 + timestamp: 1764777111296 diff --git a/pixi.toml b/pixi.toml new file mode 100644 index 0000000..dd6afc9 --- /dev/null +++ b/pixi.toml @@ -0,0 +1,14 @@ +[workspace] +authors = ["m-jahn "] +channels = ["conda-forge", "bioconda"] +name = "coding-for-biologists" +platforms = ["linux-64"] +version = "0.1.0" + +[tasks] +format-nbs = { cmd = ["bash", "-lc", "find . -name '*.ipynb' -exec nbqa black {} +"] } +check-nbs = { cmd = ["bash", "-lc", "find . -name '*.ipynb' -exec nbqa black --check --diff {} +"] } + +[dependencies] +nbqa = ">=1.9.0,<2" +black = ">=26.3.1,<27" From adcb2124560dedd95d49cd1b921a6fd309011ca0 Mon Sep 17 00:00:00 2001 From: m-jahn Date: Thu, 23 Apr 2026 09:54:44 +0200 Subject: [PATCH 4/5] feat: added actions workflow to check correct formatting of notebooks --- .github/workflows/formatting.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .github/workflows/formatting.yml diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml new file mode 100644 index 0000000..fe98652 --- /dev/null +++ b/.github/workflows/formatting.yml @@ -0,0 +1,18 @@ +name: Formatting + +on: + push: + branches: [main] + +jobs: + formatting: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - uses: prefix-dev/setup-pixi@v0 + with: + cache: true + + - name: formatting + run: pixi run check-nbs From fb6c47688e05706b774f20304268504064becd18 Mon Sep 17 00:00:00 2001 From: m-jahn Date: Thu, 23 Apr 2026 09:59:43 +0200 Subject: [PATCH 5/5] fix: gh action trigger --- .github/workflows/formatting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index fe98652..372b336 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -1,7 +1,7 @@ name: Formatting on: - push: + pull_request: branches: [main] jobs: