Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 0 additions & 32 deletions README.md
Original file line number | Diff line number | Diff line change
@@ -35,16 +35,13 @@ end
### Load a Dataset from Hugging Face

```elixir
# Load the IMDB dataset
{:ok, dataset} = ElixirDatasets.load_dataset({:hf, "imdb"})

# Load a specific split
{:ok, train_data} = ElixirDatasets.load_dataset(
{:hf, "imdb"},
split: "train"
)

# Load a specific configuration
{:ok, dataset} = ElixirDatasets.load_dataset(
{:hf, "glue"},
name: "sst2",
@@ -55,14 +52,12 @@ end
### Stream Large Datasets

```elixir
# Stream data without loading everything into memory
{:ok, stream} = ElixirDatasets.load_dataset(
{:hf, "c4"},
split: "train",
streaming: true
)

# Process first 1000 rows
stream
|> Enum.take(1000)
|> Enum.each(&process_row/1)
@@ -71,7 +66,6 @@ stream
### Parallel Loading for Performance

```elixir
# Use all CPU cores for faster loading
{:ok, dataset} = ElixirDatasets.load_dataset(
{:hf, "multi-file-dataset"},
num_proc: System.schedulers_online()
@@ -81,14 +75,12 @@ stream
### Upload Your Own Dataset

```elixir
# Create a DataFrame
df = Explorer.DataFrame.new(%{
id: [1, 2, 3],
text: ["Hello", "World", "!"],
label: [0, 1, 0]
})

# Upload to Hugging Face
{:ok, _response} = ElixirDatasets.upload_dataset(
df,
"username/my-dataset",
@@ -101,7 +93,6 @@ df = Explorer.DataFrame.new(%{
### Work with Local Files

```elixir
# Load from local directory
{:ok, dataset} = ElixirDatasets.load_dataset(
{:local, "./data"},
split: "train"
@@ -113,20 +104,16 @@ df = Explorer.DataFrame.new(%{
### Example 1: Text Classification with GLUE

```elixir
# Load SST-2 sentiment classification dataset
{:ok, train} = ElixirDatasets.load_dataset(
{:hf, "glue"},
name: "sst2",
split: "train"
)

# Explore the data
IO.inspect(Explorer.DataFrame.head(train, 5))

# Filter positive examples
positive = Explorer.DataFrame.filter(train, label == 1)

# Get statistics
stats = Explorer.DataFrame.summarise(train,
total: count(label),
positive: sum(label)
@@ -136,19 +123,16 @@ stats = Explorer.DataFrame.summarise(train,
### Example 2: Streaming Large Dataset

```elixir
# Stream Wikipedia dataset
{:ok, stream} = ElixirDatasets.load_dataset(
{:hf, "wikipedia"},
name: "20220301.en",
split: "train",
streaming: true
)

# Process in batches
stream
|> Stream.chunk_every(100)
|> Stream.each(fn batch ->
# Process batch
batch |> Enum.each(&analyze_text/1)
end)
|> Stream.run()
@@ -157,10 +141,8 @@ end)
### Example 3: Offline Mode

```elixir
# First, download the dataset
{:ok, _} = ElixirDatasets.load_dataset({:hf, "imdb"})

# Later, work offline
System.put_env("ELIXIR_DATASETS_OFFLINE", "1")

{:ok, dataset} = ElixirDatasets.load_dataset(
@@ -180,16 +162,13 @@ System.put_env("ELIXIR_DATASETS_OFFLINE", "1")
### Cache Management

```elixir
# Get cache directory
cache_dir = ElixirDatasets.cache_dir()

# Force redownload
{:ok, dataset} = ElixirDatasets.load_dataset(
{:hf, "dataset_name"},
download_mode: :force_redownload
)

# Skip verification for faster loading
{:ok, dataset} = ElixirDatasets.load_dataset(
{:hf, "dataset_name"},
verification_mode: :no_checks
@@ -239,17 +218,14 @@ ElixirDatasets focuses on core dataset loading and management features:
### Axon (Neural Networks)

```elixir
# Load dataset
{:ok, train} = ElixirDatasets.load_dataset({:hf, "mnist"})

# Convert to Nx tensors for Axon
train_tensors = train
|> Explorer.DataFrame.to_rows()
|> Enum.map(fn row ->
{Nx.tensor(row["image"]), Nx.tensor(row["label"])}
end)

# Train with Axon
model = Axon.input("input", shape: {nil, 784})
|> Axon.dense(128, activation: :relu)
|> Axon.dense(10, activation: :softmax)
@@ -258,25 +234,20 @@ model = Axon.input("input", shape: {nil, 784})
### Bumblebee (Transformers)

```elixir
# Load text dataset
{:ok, dataset} = ElixirDatasets.load_dataset({:hf, "imdb"}, split: "train")

# Load Bumblebee model
{:ok, model_info} = Bumblebee.load_model({:hf, "bert-base-uncased"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"})

# Process dataset
texts = Explorer.DataFrame.pull(dataset, "text")
inputs = Bumblebee.apply_tokenizer(tokenizer, texts)
```

### Nx (Numerical Computing)

```elixir
# Load numerical dataset
{:ok, dataset} = ElixirDatasets.load_dataset({:hf, "california_housing"})

# Convert to Nx tensors
features = dataset
|> Explorer.DataFrame.select(["feature1", "feature2", "feature3"])
|> Explorer.DataFrame.to_columns()
@@ -300,13 +271,10 @@ Full documentation is available at [HexDocs](https://hexdocs.pm/elixir_datasets)
## 🧪 Testing

```bash
# Run all tests
mix test

# Run with coverage
mix coveralls

# Run specific test file
mix test test/elixir_datasets_test.exs
```

Expand Down