Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 36 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Biberplus is a pure Python implementation of the linguistic tagging system intro

- [Features](#features)
- [Installation](#installation)
- [Optional extras](#optional-extras)
- [Quickstart Guide](#quickstart-guide)
- [Biber Tagger](#1-biber-tagger)
- [Function Words Tagger](#2-function-words-tagger)
Expand All @@ -36,29 +37,51 @@ Biberplus is a pure Python implementation of the linguistic tagging system intro

## Installation

### From PyPI (Stable Release)
Install the latest version (0.3.0) from PyPI:

```bash
pip install biberplus
```

For more details and package history, visit the [Biberplus project page on PyPI](https://pypi.org/project/biberplus/0.3.0/).
Requires:

- Python 3.10+
- pandas, NumPy, spaCy, scikit-learn, tqdm, PyYAML, factor_analyzer
- A spaCy English language model

**Important:**
Biberplus depends on spaCy for text processing. After installing biberplus, you must manually download a spaCy English model by running:

```bash
python -m spacy download en_core_web_sm
```

### Optional extras

For PCA/factor analysis plotting functions:

```bash
pip install "biberplus[plots]"
```

For development (ruff, flake8):

```bash
pip install "biberplus[dev]"
```

Then run linting with:

```bash
ruff check .
flake8 .
```

---

## Quickstart Guide

### 1. Biber Tagger

**Tag a string using the default configuration:**

```python
from biberplus.tagger import calculate_tag_frequencies

Expand All @@ -67,6 +90,7 @@ print(frequencies_df)
```

**Tag a large corpus with GPU and multi-processing:**

```python
from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies

Expand All @@ -80,6 +104,7 @@ print(frequencies_df)
### 2. Function Words Tagger

**Using the default list:**

```python
from biberplus.tagger import load_config, calculate_tag_frequencies

Expand All @@ -90,6 +115,7 @@ print(frequencies_df)
```

**Using a custom list:**

```python
from biberplus.tagger import load_config, calculate_tag_frequencies

Expand All @@ -108,6 +134,7 @@ print(frequencies_df)
### 3. Word-Level Tagging

See exactly which tags are applied to each word:

```python
import spacy
from biberplus.tagger import tag_text, load_config, load_pipeline
Expand All @@ -128,6 +155,7 @@ for word in tagged_words:
```

Example output:

```
Word: It Tags: it, PIT, CAP, PRP, SBJP
Word: does Tags: VPRT, SPAU
Expand All @@ -139,6 +167,7 @@ Word: likely Tags: JJ
### 4. Text Embeddings

Generate an embedding vector from the textual data:

```python
from biberplus.tagger import load_config
from biberplus.reducer import encode_text
Expand All @@ -151,6 +180,7 @@ print(embedding)
### 5. Dimension Reduction

**Using PCA:**

```python
from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies
from biberplus.reducer import tags_pca
Expand Down
6 changes: 3 additions & 3 deletions biberplus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Package metadata.
# NOTE(review): the project README advertises release 0.3.0 on PyPI while
# __version__ is still "0.1.0" -- confirm which value is authoritative.
__version__ = "0.1.0"
__author__ = "Kenan Alkiek"
__credits__ = "University of Michigan - The Blablablab"

from . import tagger
from . import reducer

# Sub-packages re-exported as the public API of the top-level package.
__all__ = ["tagger", "reducer"]
2 changes: 1 addition & 1 deletion biberplus/reducer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .pca_reducer import tags_pca
from .text_encoder import encode_text

# Public names of the reducer sub-package.
__all__ = ["tags_pca", "encode_text"]
8 changes: 4 additions & 4 deletions biberplus/reducer/factor_reducer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
from factor_analyzer import FactorAnalyzer


def tags_factor_analysis(frequencies_df, n_factors=3, name=None, rotation="promax"):
    """Fit a factor analysis on a tag-frequency table and return its loadings.

    Args:
        frequencies_df: DataFrame containing a ``tag`` column. That column is
            dropped and the remaining columns are passed to the model as a
            plain array (presumably all numeric -- confirm against callers).
        n_factors: Number of factors to extract; also the number of
            ``Factor1..FactorN`` columns in the result.
        name: Optional label. When truthy, it is written to a ``name`` column
            on every row of the loadings frame.
        rotation: Rotation method forwarded to ``FactorAnalyzer``.

    Returns:
        A ``(loadings_df, fa)`` tuple: the loadings as a DataFrame and the
        fitted ``FactorAnalyzer`` instance.
    """
    fa = FactorAnalyzer(rotation=rotation, n_factors=n_factors)
    X = frequencies_df.drop("tag", axis=1).values
    fa.fit(X)

    columns = [f"Factor{i + 1}" for i in range(n_factors)]
    loadings_df = pd.DataFrame(data=fa.loadings_, columns=columns)

    if name:
        loadings_df["name"] = name

    return loadings_df, fa
75 changes: 41 additions & 34 deletions biberplus/reducer/pca_reducer.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,76 @@
import matplotlib.pyplot as plt
from __future__ import annotations

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA

try:
import matplotlib.pyplot as plt
import seaborn as sns
_HAS_PLOTTING = True
except ImportError:
_HAS_PLOTTING = False


def _require_plotting():
    """Raise ImportError unless the optional plotting imports succeeded.

    Relies on the module-level ``_HAS_PLOTTING`` flag, which is set by the
    guarded matplotlib/seaborn import at the top of this module.
    """
    if _HAS_PLOTTING:
        return
    message = (
        "Plotting requires matplotlib and seaborn. "
        "Install with: pip install biberplus[plots]"
    )
    raise ImportError(message)


def tags_pca(frequencies_df, components=2, name=None):
    """Run PCA on a tag-frequency table.

    Args:
        frequencies_df: DataFrame containing a ``tag`` column. That column is
            dropped and the remaining columns are passed to PCA as a plain
            array (presumably all numeric -- confirm against callers).
        components: Forwarded to ``PCA(n_components=...)``.
        name: Optional label. When truthy, it is written to a ``name`` column
            on every row of the result.

    Returns:
        A ``(pca_df, explained_variance)`` tuple: the transformed data with
        columns ``PC1..PCn`` (plus ``name`` if given) and the fitted model's
        ``explained_variance_`` array.
    """
    pca = PCA(n_components=components)
    X = frequencies_df.drop("tag", axis=1).values
    pca_features = pca.fit_transform(X)

    # Label columns from the fitted output width rather than from
    # ``components`` so non-integer settings PCA accepts (None, a float
    # variance fraction) do not break at ``range(components)``.
    columns = [f"PC{i + 1}" for i in range(pca_features.shape[1])]

    pca_df = pd.DataFrame(data=pca_features, columns=columns)
    if name:
        pca_df["name"] = name
    return pca_df, pca.explained_variance_


def plot_pca_2d(df):
    """Show a 2D scatter plot of ``PC1`` against ``PC2``, colored by ``name``.

    Args:
        df: DataFrame with ``PC1``, ``PC2`` and ``name`` columns.

    Raises:
        ImportError: If matplotlib/seaborn are not installed.
    """
    _require_plotting()
    sns.set()
    # fit_reg=False: draw the points only, without a regression line.
    sns.lmplot(x="PC1", y="PC2", data=df, hue="name", fit_reg=False, legend=True)
    plt.title("2D PCA Graph")
    plt.show()


def visualize_explained_variance(explained_variance):
    """Show a bar chart of explained variance per PCA feature.

    Args:
        explained_variance: Sized sequence of variance values; bar ``i``
            (1-based on the x axis) shows the ``i``-th value.

    Raises:
        ImportError: If matplotlib/seaborn are not installed.
    """
    _require_plotting()
    plt.bar(range(1, len(explained_variance) + 1), explained_variance)
    plt.xlabel("PCA Feature")
    plt.ylabel("Explained variance")
    plt.title("Feature Explained Variance")
    plt.show()


def plot_explained_variance(pca):
    """Show a bar chart of the explained variance of each principal component.

    Args:
        pca: Fitted PCA-like object exposing ``n_components_`` and
            ``explained_variance_``.

    Raises:
        ImportError: If matplotlib/seaborn are not installed.
    """
    _require_plotting()
    n_components = pca.n_components_
    plt.figure(figsize=(10, 5))
    plt.bar(range(n_components), pca.explained_variance_, align="center")
    plt.xticks(range(n_components), [f"PC{i + 1}" for i in range(n_components)])
    plt.ylabel("Explained Variance")
    plt.xlabel("Principal Components")
    plt.title("Explained Variance of PCA Components")
    plt.show()


def plot_cumulative_explained_variance(pca):
    """Show a line plot of the cumulative explained-variance ratio.

    Args:
        pca: Fitted PCA-like object exposing ``n_components_`` and
            ``explained_variance_ratio_``.

    Raises:
        ImportError: If matplotlib/seaborn are not installed.
    """
    _require_plotting()
    n_components = pca.n_components_
    explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)
    plt.figure(figsize=(10, 5))
    plt.plot(range(n_components), explained_variance_ratio_cumsum, marker="o")
    plt.xticks(range(n_components), [f"PC{i + 1}" for i in range(n_components)])
    plt.ylabel("Cumulative Explained Variance Ratio")
    plt.xlabel("Principal Components")
    plt.title("Cumulative Explained Variance Ratio of PCA Components")
    plt.grid(True)
    plt.show()
34 changes: 25 additions & 9 deletions biberplus/reducer/text_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,34 @@ def encode_text(config, text, round_to=10):
encodings = {}

biber_tags = BIBER_PLUS_TAGS + DOC_TAGS
binary_tags = ['BIN_' + tag for tag in biber_tags]
binary_tags = ["BIN_" + tag for tag in biber_tags]

# Split the counts by type
if config['binary_tags']:
binary_frequencies = frequencies_df[frequencies_df['tag'].isin(binary_tags)]
encodings['binary'] = binary_frequencies[['mean', 'std']].to_numpy().flatten().round(round_to).tolist()
if config["binary_tags"]:
binary_frequencies = frequencies_df[frequencies_df["tag"].isin(binary_tags)]
encodings["binary"] = (
binary_frequencies[["mean", "std"]]
.to_numpy()
.flatten()
.round(round_to)
.tolist()
)

if config['function_words']:
fw_frequencies = frequencies_df[~frequencies_df['tag'].isin(biber_tags + binary_tags)]
encodings['function_words'] = fw_frequencies.drop('tag', axis=1).to_numpy().flatten().round(round_to).tolist()
if config["function_words"]:
fw_frequencies = frequencies_df[
~frequencies_df["tag"].isin(biber_tags + binary_tags)
]
encodings["function_words"] = (
fw_frequencies.drop("tag", axis=1)
.to_numpy()
.flatten()
.round(round_to)
.tolist()
)

frequencies_df = frequencies_df[frequencies_df['tag'].isin(biber_tags)]
encodings['biber'] = frequencies_df.drop('tag', axis=1).to_numpy().flatten().round(round_to).tolist()
frequencies_df = frequencies_df[frequencies_df["tag"].isin(biber_tags)]
encodings["biber"] = (
frequencies_df.drop("tag", axis=1).to_numpy().flatten().round(round_to).tolist()
)

return encodings
10 changes: 8 additions & 2 deletions biberplus/tagger/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,11 @@
from .tagger import load_config, load_pipeline, tag_text
from .tag_frequencies import calculate_tag_frequencies

# Public names of the tagger sub-package.
__all__ = [
    "load_config",
    "load_pipeline",
    "tag_text",
    "calculate_tag_frequencies",
    "BiberPlusTagger",
    "FunctionWordsTagger",
]
Loading