diff --git a/README.md b/README.md index bc2bf21..b6a8057 100644 --- a/README.md +++ b/README.md @@ -35,16 +35,13 @@ end ### Load a Dataset from Hugging Face ```elixir -# Load the IMDB dataset {:ok, dataset} = ElixirDatasets.load_dataset({:hf, "imdb"}) -# Load a specific split {:ok, train_data} = ElixirDatasets.load_dataset( {:hf, "imdb"}, split: "train" ) -# Load a specific configuration {:ok, dataset} = ElixirDatasets.load_dataset( {:hf, "glue"}, name: "sst2", @@ -55,14 +52,12 @@ end ### Stream Large Datasets ```elixir -# Stream data without loading everything into memory {:ok, stream} = ElixirDatasets.load_dataset( {:hf, "c4"}, split: "train", streaming: true ) -# Process first 1000 rows stream |> Enum.take(1000) |> Enum.each(&process_row/1) @@ -71,7 +66,6 @@ stream ### Parallel Loading for Performance ```elixir -# Use all CPU cores for faster loading {:ok, dataset} = ElixirDatasets.load_dataset( {:hf, "multi-file-dataset"}, num_proc: System.schedulers_online() @@ -81,14 +75,12 @@ stream ### Upload Your Own Dataset ```elixir -# Create a DataFrame df = Explorer.DataFrame.new(%{ id: [1, 2, 3], text: ["Hello", "World", "!"], label: [0, 1, 0] }) -# Upload to Hugging Face {:ok, _response} = ElixirDatasets.upload_dataset( df, "username/my-dataset", @@ -101,7 +93,6 @@ df = Explorer.DataFrame.new(%{ ### Work with Local Files ```elixir -# Load from local directory {:ok, dataset} = ElixirDatasets.load_dataset( {:local, "./data"}, split: "train" @@ -113,20 +104,16 @@ df = Explorer.DataFrame.new(%{ ### Example 1: Text Classification with GLUE ```elixir -# Load SST-2 sentiment classification dataset {:ok, train} = ElixirDatasets.load_dataset( {:hf, "glue"}, name: "sst2", split: "train" ) -# Explore the data IO.inspect(Explorer.DataFrame.head(train, 5)) -# Filter positive examples positive = Explorer.DataFrame.filter(train, label == 1) -# Get statistics stats = Explorer.DataFrame.summarise(train, total: count(label), positive: sum(label) @@ -136,7 +123,6 @@ stats = Explorer.DataFrame.summarise(train, ### Example 2: Streaming Large Dataset ```elixir -# Stream Wikipedia dataset {:ok, stream} = ElixirDatasets.load_dataset( {:hf, "wikipedia"}, name: "20220301.en", @@ -144,11 +130,9 @@ stats = Explorer.DataFrame.summarise(train, streaming: true ) -# Process in batches stream |> Stream.chunk_every(100) |> Stream.each(fn batch -> - # Process batch batch |> Enum.each(&analyze_text/1) end) |> Stream.run() @@ -157,10 +141,8 @@ end) ### Example 3: Offline Mode ```elixir -# First, download the dataset {:ok, _} = ElixirDatasets.load_dataset({:hf, "imdb"}) -# Later, work offline System.put_env("ELIXIR_DATASETS_OFFLINE", "1") {:ok, dataset} = ElixirDatasets.load_dataset( @@ -180,16 +162,13 @@ System.put_env("ELIXIR_DATASETS_OFFLINE", "1") ### Cache Management ```elixir -# Get cache directory cache_dir = ElixirDatasets.cache_dir() -# Force redownload {:ok, dataset} = ElixirDatasets.load_dataset( {:hf, "dataset_name"}, download_mode: :force_redownload ) -# Skip verification for faster loading {:ok, dataset} = ElixirDatasets.load_dataset( {:hf, "dataset_name"}, verification_mode: :no_checks @@ -239,17 +218,14 @@ ElixirDatasets focuses on core dataset loading and management features: ### Axon (Neural Networks) ```elixir -# Load dataset {:ok, train} = ElixirDatasets.load_dataset({:hf, "mnist"}) -# Convert to Nx tensors for Axon train_tensors = train |> Explorer.DataFrame.to_rows() |> Enum.map(fn row -> {Nx.tensor(row["image"]), Nx.tensor(row["label"])} end) -# Train with Axon model = Axon.input("input", shape: {nil, 784}) |> Axon.dense(128, activation: :relu) |> Axon.dense(10, activation: :softmax) @@ -258,14 +234,11 @@ model = Axon.input("input", shape: {nil, 784}) ### Bumblebee (Transformers) ```elixir -# Load text dataset {:ok, dataset} = ElixirDatasets.load_dataset({:hf, "imdb"}, split: "train") -# Load Bumblebee model {:ok, model_info} = Bumblebee.load_model({:hf, "bert-base-uncased"}) {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"}) -# Process dataset texts = Explorer.DataFrame.pull(dataset, "text") inputs = Bumblebee.apply_tokenizer(tokenizer, texts) ``` @@ -273,10 +246,8 @@ inputs = Bumblebee.apply_tokenizer(tokenizer, texts) ### Nx (Numerical Computing) ```elixir -# Load numerical dataset {:ok, dataset} = ElixirDatasets.load_dataset({:hf, "california_housing"}) -# Convert to Nx tensors features = dataset |> Explorer.DataFrame.select(["feature1", "feature2", "feature3"]) |> Explorer.DataFrame.to_columns() @@ -300,13 +271,10 @@ Full documentation is available at [HexDocs](https://hexdocs.pm/elixir_datasets) ## 🧪 Testing ```bash -# Run all tests mix test -# Run with coverage mix coveralls -# Run specific test file mix test test/elixir_datasets_test.exs ```