diff --git a/README.md b/README.md
index b6a8057..868ebbd 100644
--- a/README.md
+++ b/README.md
@@ -4,18 +4,16 @@
[](https://hexdocs.pm/elixir_datasets)
[](https://opensource.org/licenses/MIT)
-**ElixirDatasets** is a comprehensive library for accessing and managing datasets from Hugging Face Hub in Elixir. Inspired by the Python `datasets` library, it brings powerful dataset management capabilities to the Elixir ecosystem with seamless integration with Explorer DataFrames.
+**ElixirDatasets** is a comprehensive library for accessing and managing datasets from Hugging Face Hub in Elixir. Inspired by the [Python `datasets` library](https://github.com/huggingface/datasets), it brings powerful dataset management capabilities to the Elixir ecosystem with seamless integration with Explorer DataFrames.
## ✨ Features
- π **Easy Access to Hugging Face Hub** - Load thousands of datasets with a single function call
- π **Explorer Integration** - Automatic conversion to Explorer DataFrames for data manipulation
-- β‘ **High Performance** - Parallel processing support for loading multiple files
- 💾 **Smart Caching** - Intelligent local caching to avoid redundant downloads
- π **Streaming Support** - Process large datasets without loading everything into memory
- π€ **Upload Datasets** - Publish your own datasets to Hugging Face Hub
- π **Private Repositories** - Full support for authentication and private datasets
-- π **Offline Mode** - Work with cached datasets without internet connection
- 🎯 **Multiple Formats** - Support for CSV, Parquet, and JSONL files
## 📦 Installation
@@ -32,278 +30,56 @@ end
## π Quick Start
-### Load a Dataset from Hugging Face
-
```elixir
-{:ok, dataset} = ElixirDatasets.load_dataset({:hf, "imdb"})
-
-{:ok, train_data} = ElixirDatasets.load_dataset(
- {:hf, "imdb"},
- split: "train"
-)
-
-{:ok, dataset} = ElixirDatasets.load_dataset(
- {:hf, "glue"},
- name: "sst2",
+{:ok, [train_df]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
split: "train"
)
-```
-### Stream Large Datasets
+{:ok, datasets} = ElixirDatasets.load_dataset({:local, "./data"})
-```elixir
{:ok, stream} = ElixirDatasets.load_dataset(
- {:hf, "c4"},
+ {:hf, "stanfordnlp/imdb", subdir: "plain_text"},
split: "train",
streaming: true
)
-stream
-|> Enum.take(1000)
-|> Enum.each(&process_row/1)
-```
-
-### Parallel Loading for Performance
-
-```elixir
-{:ok, dataset} = ElixirDatasets.load_dataset(
- {:hf, "multi-file-dataset"},
- num_proc: System.schedulers_online()
-)
-```
-
-### Upload Your Own Dataset
-
-```elixir
-df = Explorer.DataFrame.new(%{
- id: [1, 2, 3],
- text: ["Hello", "World", "!"],
- label: [0, 1, 0]
-})
-
-{:ok, _response} = ElixirDatasets.upload_dataset(
- df,
- "username/my-dataset",
- file_extension: "parquet",
- commit_message: "Initial upload",
- auth_token: System.get_env("HF_TOKEN")
-)
-```
-
-### Work with Local Files
-
-```elixir
-{:ok, dataset} = ElixirDatasets.load_dataset(
- {:local, "./data"},
- split: "train"
-)
+stream |> Enum.take(100) |> IO.inspect()
```
## π Examples
-### Example 1: Text Classification with GLUE
-
-```elixir
-{:ok, train} = ElixirDatasets.load_dataset(
- {:hf, "glue"},
- name: "sst2",
- split: "train"
-)
-
-IO.inspect(Explorer.DataFrame.head(train, 5))
-
-positive = Explorer.DataFrame.filter(train, label == 1)
-
-stats = Explorer.DataFrame.summarise(train,
- total: count(label),
- positive: sum(label)
-)
-```
-
-### Example 2: Streaming Large Dataset
-
-```elixir
-{:ok, stream} = ElixirDatasets.load_dataset(
- {:hf, "wikipedia"},
- name: "20220301.en",
- split: "train",
- streaming: true
-)
-
-stream
-|> Stream.chunk_every(100)
-|> Stream.each(fn batch ->
- batch |> Enum.each(&analyze_text/1)
-end)
-|> Stream.run()
-```
-
-### Example 3: Offline Mode
-
-```elixir
-{:ok, _} = ElixirDatasets.load_dataset({:hf, "imdb"})
-
-System.put_env("ELIXIR_DATASETS_OFFLINE", "1")
-
-{:ok, dataset} = ElixirDatasets.load_dataset(
- {:hf, "imdb"},
- download_mode: :reuse_dataset_if_exists
-)
-```
+All examples can be found in the [examples](examples) directory.
+- `examples/usage_examples.livemd` - Comprehensive usage examples of the `elixir_datasets` API
+- `examples/integration_examples.livemd` - Examples demonstrating integration with other Elixir libraries like [Nx](https://github.com/elixir-nx/nx), [Axon](https://github.com/elixir-nx/axon), and [Bumblebee](https://github.com/elixir-nx/bumblebee)
## 🔧 Configuration
### Environment Variables
-- `ELIXIR_DATASETS_CACHE_DIR` - Custom cache directory (default: system cache)
+- `ELIXIR_DATASETS_CACHE_DIR` - Custom cache directory
- `ELIXIR_DATASETS_OFFLINE` - Enable offline mode (`"1"` or `"true"`)
-- `HUGGING_FACE_HUB_TOKEN` - Authentication token for private datasets
-
-### Cache Management
-
-```elixir
-cache_dir = ElixirDatasets.cache_dir()
-
-{:ok, dataset} = ElixirDatasets.load_dataset(
- {:hf, "dataset_name"},
- download_mode: :force_redownload
-)
-
-{:ok, dataset} = ElixirDatasets.load_dataset(
- {:hf, "dataset_name"},
- verification_mode: :no_checks
-)
-```
-
-## π Comparison with Python `datasets`
-
-| Feature | ElixirDatasets | Python `datasets` |
-|---------|----------------|-------------------|
-| Load from Hugging Face Hub | ✅ | ✅ |
-| Streaming | ✅ | ✅ |
-| Caching | ✅ | ✅ |
-| Parallel Processing | ✅ | ✅ |
-| Upload to Hub | ✅ | ✅ |
-| Multiple Formats (CSV, Parquet, JSONL) | ✅ | ✅ |
-| Offline Mode | ✅ | ✅ |
-| Private Datasets | ✅ | ✅ |
-| DataFrame Integration | ✅ (Explorer) | ✅ (Pandas/Polars) |
-| Map/Filter Operations | ⚠️ (via Explorer) | ✅ |
-| Custom Dataset Scripts | ❌ | ✅ |
-| Audio/Image Processing | ❌ | ✅ |
-| Metrics | ❌ | ✅ |
-
-**Legend:** ✅ Fully Supported | ⚠️ Partial Support | ❌ Not Supported
-
-### What's Supported
-
-ElixirDatasets focuses on core dataset loading and management features:
-- ✅ Loading datasets from Hugging Face Hub
-- ✅ Streaming for large datasets
-- ✅ Parallel processing with `num_proc`
-- ✅ Smart caching and offline mode
-- ✅ Upload and manage datasets
-- ✅ CSV, Parquet, and JSONL formats
-- ✅ Integration with Explorer DataFrames
-
-### What's Different
-
-- **DataFrame Library**: Uses Explorer instead of Pandas
-- **Data Processing**: Leverage Explorer's powerful API for transformations
-- **Concurrency**: Built on Elixir's process model for true parallelism
-- **Simplicity**: Focused API without custom dataset scripts
-
-## π Integration with Elixir ML Ecosystem
-
-### Axon (Neural Networks)
-
-```elixir
-{:ok, train} = ElixirDatasets.load_dataset({:hf, "mnist"})
-
-train_tensors = train
-|> Explorer.DataFrame.to_rows()
-|> Enum.map(fn row ->
- {Nx.tensor(row["image"]), Nx.tensor(row["label"])}
-end)
-
-model = Axon.input("input", shape: {nil, 784})
-|> Axon.dense(128, activation: :relu)
-|> Axon.dense(10, activation: :softmax)
-```
-
-### Bumblebee (Transformers)
-
-```elixir
-{:ok, dataset} = ElixirDatasets.load_dataset({:hf, "imdb"}, split: "train")
-
-{:ok, model_info} = Bumblebee.load_model({:hf, "bert-base-uncased"})
-{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"})
-
-texts = Explorer.DataFrame.pull(dataset, "text")
-inputs = Bumblebee.apply_tokenizer(tokenizer, texts)
-```
-
-### Nx (Numerical Computing)
-
-```elixir
-{:ok, dataset} = ElixirDatasets.load_dataset({:hf, "california_housing"})
-
-features = dataset
-|> Explorer.DataFrame.select(["feature1", "feature2", "feature3"])
-|> Explorer.DataFrame.to_columns()
-|> Map.values()
-|> Enum.map(&Nx.tensor/1)
-|> Nx.stack()
-```
+- `HF_TOKEN` - Authentication token for private datasets
+- [🚧 In progress] `HF_DEBUG` - Enable debug logging (`"1"` or `"true"`)
## π Documentation
-Full documentation is available at [HexDocs](https://hexdocs.pm/elixir_datasets).
-
-### Key Modules
+Full documentation is available on [HexDocs](https://hexdocs.pm/elixir_datasets). Documentation for the current development version, including features that are still under development, is hosted on [GitHub Pages](https://radoslawrolka.github.io/ElixirDatasets/api-reference.html). You can also generate the documentation locally with:
-- `ElixirDatasets` - Main API for loading and managing datasets
-- `ElixirDatasets.DatasetInfo` - Dataset metadata management
-- `ElixirDatasets.Utils.Loader` - File loading utilities
-- `ElixirDatasets.Utils.Uploader` - Upload functionality
-- `ElixirDatasets.HuggingFace.Hub` - Hugging Face Hub integration
+```bash
+mix docs
+```
## 🧪 Testing
```bash
-mix test
-
-mix coveralls
-
-mix test test/elixir_datasets_test.exs
+MIX_ENV=test mix test
```
-## π€ Contributing
-
-Contributions are welcome! Please feel free to submit a Pull Request.
-
-1. Fork the repository
-2. Create your feature branch (`git checkout -b feature/amazing-feature`)
-3. Commit your changes (`git commit -m 'Add amazing feature'`)
-4. Push to the branch (`git push origin feature/amazing-feature`)
-5. Open a Pull Request
-
## π License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
Copyright (c) 2025 Radosław Rolka, Weronika Wojtas
-## π Acknowledgments
-
-- Inspired by [Hugging Face Datasets](https://github.com/huggingface/datasets)
-- Built with [Explorer](https://github.com/elixir-nx/explorer) for DataFrame operations
-- Uses [Req](https://github.com/wojtekmach/req) for HTTP requests
-
-## π Support
-
-- π [Documentation](https://hexdocs.pm/elixir_datasets)
-- π [Issue Tracker](https://github.com/yourusername/elixir_datasets/issues)
-- π¬ [Discussions](https://github.com/yourusername/elixir_datasets/discussions)
-
---
diff --git a/examples/example_1.livemd b/examples/example_1.livemd
deleted file mode 100644
index 953331a..0000000
--- a/examples/example_1.livemd
+++ /dev/null
@@ -1,821 +0,0 @@
-
-
-# Example_1
-
-```elixir
-##### Target version
-# Install dependencies
-# Mix.install([
-# {:elixir_datasets, "0.0.1"}
-# ])
-##### Local dev-testing version
-Mix.install([
- {:elixir_datasets, path: "#{__DIR__}/.."}
-])
-
-# get auth_token explicitly for downloading from HuggingFace
-auth_token = System.get_env("HF_TOKEN")
-:ok
-```
-
-## Import library
-
-```elixir
-import ElixirDatasets
-```
-
-
-
-```
-ElixirDatasets
-```
-
-## Load dataset
-
-This section demonstrates all the ways to load datasets using `ElixirDatasets.load_dataset/2`.
-
-### Basic Loading
-
-#### Load dataset from Huggingface
-
-```elixir
-ElixirDatasets.load_dataset({:hf, "fka/awesome-chatgpt-prompts"})
-```
-
-
-
-```
-{:ok,
- [
- #Explorer.DataFrame<
- Polars[948 x 5]
- act string ["Ethereum Developer", "Linux Terminal", "English Translator and Improver",
- "Job Interviewer", "JavaScript Console", ...]
- prompt string ["Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.",
- "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is pwd",
- "I want you to act as an English translator, spelling corrector and improver. I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in English. I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level English words and sentences. Keep the meaning same, but make them more literary. I want you to only reply the correction, the improvements and nothing else, do not write explanations. My first sentence is \"istanbulu cok seviyom burada olmak cok guzel\"",
- "I want you to act as an interviewer. I will be the candidate and you will ask me the interview questions for the ${Position:Software Developer} position. I want you to only reply as the interviewer. Do not write all the conversation at once. I want you to only do the interview with me. Ask me the questions and wait for my answers. Do not write explanations. Ask me the questions one by one like an interviewer does and wait for my answers.\n\nMy first sentence is \"Hi\"",
- "I want you to act as a javascript console. I will type commands and you will reply with what the javascript console should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is console.log(\"Hello World\");",
- ...]
- for_devs boolean [true, true, false, false, true, ...]
- type string ["TEXT", "TEXT", "TEXT", "TEXT", "TEXT", ...]
- contributor string ["ameya-2003", "f", "f", "f,iltekin", "omerimzali", ...]
- >
- ]}
-```
-
-#### Load dataset from Huggingface from given subdir
-
-```elixir
-ElixirDatasets.load_dataset(
- {:hf, "stanfordnlp/imdb", subdir: "plain_text"})
-```
-
-
-
-```
-{:ok,
- [
- #Explorer.DataFrame<
- Polars[25000 x 2]
- text string ["I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichΓ©d and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.",
- "Worth the entertainment value of a rental, especially if you like action movies. This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before.
The plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. Many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. All passably acted but again nothing special.
I thought the main villains were pretty well done and fairly well acted. By the end of the movie you certainly knew who the good guys were and weren't. There was an emotional lift as the really bad ones got their just deserts. Very simplistic, but then you weren't expecting Hamlet, right? The only thing I found really annoying was the constant cuts to VDs daughter during the last fight scene.
Not bad. Not good. Passable 4.",
- "its a totally average film with a few semi-alright action sequences that make the plot seem a little better and remind the viewer of the classic van dam films. parts of the plot don't make sense and seem to be added in to use up time. the end plot is that of a very basic type that doesn't leave the viewer guessing and any twists are obvious from the beginning. the end scene with the flask backs don't make sense as they are added in and seem to have little relevance to the history of van dam's character. not really worth watching again, bit disappointed in the end production, even though it is apparent it was shot on a low budget certain shots and sections in the film are of poor directed quality",
- "STAR RATING: ***** Saturday Night **** Friday Night *** Friday Morning ** Sunday Night * Monday Morning
Former New Orleans homicide cop Jack Robideaux (Jean Claude Van Damme) is re-assigned to Columbus, a small but violent town in Mexico to help the police there with their efforts to stop a major heroin smuggling operation into their town. The culprits turn out to be ex-military, lead by former commander Benjamin Meyers (Stephen Lord, otherwise known as Jase from East Enders) who is using a special method he learned in Afghanistan to fight off his opponents. But Jack has a more personal reason for taking him down, that draws the two men into an explosive final showdown where only one will walk away alive.
After Until Death, Van Damme appeared to be on a high, showing he could make the best straight to video films in the action market. While that was a far more drama oriented film, with The Shepherd he has returned to the high-kicking, no brainer action that first made him famous and has sadly produced his worst film since Derailed. It's nowhere near as bad as that film, but what I said still stands.
A dull, predictable film, with very little in the way of any exciting action. What little there is mainly consists of some limp fight scenes, trying to look cool and trendy with some cheap slo-mo/sped up effects added to them that sadly instead make them look more desperate. Being a Mexican set film, director Isaac Florentine has tried to give the film a Robert Rodriguez/Desperado sort of feel, but this only adds to the desperation.
VD gives a particularly uninspired performance and given he's never been a Robert De Niro sort of actor, that can't be good. As the villain, Lord shouldn't expect to leave the beeb anytime soon. He gets little dialogue at the beginning as he struggles to muster an American accent but gets mysteriously better towards the end. All the supporting cast are equally bland, and do nothing to raise the films spirits at all.
This is one shepherd that's strayed right from the flock. *",
- "First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!",
- ...]
- label s64 [0, 0, 0, 0, 0, ...]
- >,
- #Explorer.DataFrame<
- Polars[25000 x 2]
- text string ["I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered \"controversial\" I really had to see this for myself.
The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.
What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. While my countrymen mind find it shocking, in reality sex and nudity are a major staple in Swedish cinema. Even Ingmar Bergman, arguably their answer to good old boy John Ford, had sex scenes in his films.
I do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in America. I AM CURIOUS-YELLOW is a good film for anyone wanting to study the meat and potatoes (no pun intended) of Swedish cinema. But really, this film doesn't have much of a plot.",
- "\"I Am Curious: Yellow\" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don't exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) \"double-standard\" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the same cannot be said for a man. In fact, you generally won't see female genitals in an American film in anything short of porn or explicit erotica. This alleged double-standard is less a double standard than an admittedly depressing ability to come to terms culturally with the insides of women's bodies.",
- "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.
One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).
One might better spend one's time staring out a window at a tree growing.
",
- "This film was probably inspired by Godard's Masculin, fΓ©minin and I urge you to see that film instead.
The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.
A movie of its time, and place. 2/10.",
- "Oh, brother...after hearing about this ridiculous film for umpteen years all I can think of is that old Peggy Lee song..
\"Is that all there is??\" ...I was just an early teen when this smoked fish hit the U.S. I was too young to get in the theater (although I did manage to sneak into \"Goodbye Columbus\"). Then a screening at a local film museum beckoned - Finally I could see this film, except now I was as old as my parents were when they schlepped to see it!!
The ONLY reason this film was not condemned to the anonymous sands of time was because of the obscenity case sparked by its U.S. release. MILLIONS of people flocked to this stinker, thinking they were going to see a sex film...Instead, they got lots of closeups of gnarly, repulsive Swedes, on-street interviews in bland shopping malls, asinie political pretension...and feeble who-cares simulated sex scenes with saggy, pale actors.
Cultural icon, holy grail, historic artifact..whatever this thing was, shred it, burn it, then stuff the ashes in a lead box!
Elite esthetes still scrape to find value in its boring pseudo revolutionary political spewings..But if it weren't for the censorship scandal, it would have been ignored, then forgotten.
Instead, the \"I Am Blank, Blank\" rhythymed title was repeated endlessly for years as a titilation for porno films (I am Curious, Lavender - for gay films, I Am Curious, Black - for blaxploitation films, etc..) and every ten years or so the thing rises from the dead, to be viewed by a new generation of suckers who want to see that \"naughty sex film\" that \"revolutionized the film industry\"...
Yeesh, avoid like the plague..Or if you MUST see it - rent the video and fast forward to the \"dirty\" parts, just to get it over with.
",
- ...]
- label s64 [0, 0, 0, 0, 0, ...]
- >,
- #Explorer.DataFrame<
- Polars[50000 x 2]
- text string ["This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie \"Leon\" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please \"Frankie Starlight\", she's speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the \"Point of no return\" and especially the \"La femme Nikita\" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which \"translate\" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you'll regret later :)",
- "When I say this is my favourite film of all time, that comment is not to be taken lightly. I probably watch far too many films than is healthy for me, and have loved quite a few of them. I first saw \"La Femme Nikita\" nearly ten years ago, and it still manages to be my absolute favourite. Why?
This is more than an incredibly stylish and sexy thriller. Luc Besson's great flair for impeccable direction, fashion, and appropriate usage of music makes this a very watchable film. But it is Anne Parillaud's perfect rendering of a complex character who transforms from a heartless killer into a compassionate, vibrant young woman that makes this film beautiful. I can't keep my eyes off of her when she is on screen.
I have seen several of Luc Besson's films including \"Subway\", \"The Professional\", and the irritating \"Fifth Element\", and \"Nikita\" is without a doubt, far superior to any of these. Although this film has tragic elements, it is ultimately extremely hopeful. It is the story of a person who is cruel and merciless, who ultimately comes to realize her own humanity and her own personal power. That, to me is extremely inspiring. If there is hope for Nikita, there is hope for all of us.",
- "I saw this movie because I am a huge fan of the TV series of the same name starring Roy Dupuis and Pet Wilson. The movie was really good and I saw how the TV show is based on the movie. A few episodes of the TV series came directly from the movie and their similarity was amazing. To keep things short, any fan of the movie has to watch the series and any fan of the series must see the original Nikita.",
- "Being that the only foreign films I usually like star a Japanese person in a rubber suit who crushes little tiny buildings and tanks, I had high hopes for this movie. I thought that this was a movie that wouldn't put me to sleep. WRONG! Starts off with a bang, okay, now she's in training, alright, she's an assassin, I'm still with you, oh, now she's having this moral dilemma and she can't decide if she loves her boyfriend or her controller, zzzzz.... Oh well, back to Gamera!",
- "After seeing Point of No Return (a great movie) and being told that the original was better, I was certainly thrilled to see that one of the indie film channels was running La Femme Nikita. Then I saw the movie. Ouch! This was a major let-down.
Nikita herself reminds me of Jar Jar Binks more than any other character I've seen recently. She comes across entirely as comic relief. The movie simply has nothing to recommend it besides the core concept of an evil, inhuman character paradoxically learning to be human while training as an assassin, and that concept failed miserably in Nikita due to the poor writing of the title role.",
- ...]
- label s64 [-1, -1, -1, -1, -1, ...]
- >
- ]}
-```
-
-### Load dataset from Huggingface with auth token as option
-
-```elixir
-ElixirDatasets.load_dataset!(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- %{auth_token: auth_token})
-```
-
-
-
-```
-[
- #Explorer.DataFrame<
- Polars[1066 x 2]
- text string ["lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .",
- "consistently clever and suspenseful .",
- "it's like a \" big chill \" reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .",
- "the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with tremendous skill .",
- "red dragon \" never cuts corners .", ...]
- label s64 [1, 1, 1, 1, 1, ...]
- >,
- #Explorer.DataFrame<
- Polars[8530 x 2]
- text string ["the rock is destined to be the 21st century's new \" conan \" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .",
- "the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .",
- "effective but too-tepid biopic",
- "if you sometimes like to go to the movies to have fun , wasabi is a good place to start .",
- "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .",
- ...]
- label s64 [1, 1, 1, 1, 1, ...]
- >,
- #Explorer.DataFrame<
- Polars[1066 x 2]
- text string ["compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .",
- "the soundtrack alone is worth the price of admission .",
- "rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .",
- "beneath the film's obvious determination to shock at any cost lies considerable skill and determination , backed by sheer nerve .",
- "bielinsky is a filmmaker of impressive talent .", ...]
- label s64 [1, 1, 1, 1, 1, ...]
- >
-]
-```
-
-### Load dataset from local resources
-
-```elixir
-ElixirDatasets.load_dataset({:local, "#{__DIR__}/../resources"})
-```
-
-
-
-```
-{:ok,
- [
- #Explorer.DataFrame<
- Polars[11 x 2]
- id s64 [0, 1, 2, 3, 4, ...]
- number string ["csv", "one", "two", "three", "four", ...]
- >,
- #Explorer.DataFrame<
- Polars[11 x 2]
- id s64 [0, 1, 2, 3, 4, ...]
- number string ["jsonl", "one", "two", "three", "four", ...]
- >,
- #Explorer.DataFrame<
- Polars[11 x 2]
- id s64 [0, 1, 2, 3, 4, ...]
- number string ["parquet", "one", "two", "three", "four", ...]
- >
- ]}
-```
-
-### Advanced Loading Options
-
-The `load_dataset` function supports several parameters for flexible data loading:
-
-* **`split`**: Load only a specific split (e.g., "train", "validation", "test")
-* **`name`**: Filter files by matching a pattern in the filename/path
-* **`streaming`**: Return file paths instead of loading data into memory
-* **`download_mode`**: Control caching behavior (`:reuse_dataset_if_exists`, `:force_redownload`)
-* **`verification_mode`**: Control validation checks (`:basic_checks`, `:no_checks`)
-* **`num_proc`**: Number of parallel processes for faster loading
-* **`cache_dir`**: Custom cache directory location
-* **`offline`**: Only use cached files, no network requests
-
-**Note**: For datasets with subdirectories (like GLUE), use the existing `subdir` option in the repository tuple: `{:hf, "dataset-name", subdir: "config-name"}`. The `name` parameter is for filtering files within a directory by matching patterns in filenames.
-
-#### Load only a specific split
-
-Load only the training split from a dataset:
-
-```elixir
-{:ok, train_data} = ElixirDatasets.load_dataset(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- split: "train"
-)
-
-IO.puts("Loaded #{length(train_data)} dataset(s) from 'train' split")
-[train_df] = train_data
-IO.puts("Number of training examples: #{Explorer.DataFrame.n_rows(train_df)}")
-```
-
-
-
-```
-Loaded 1 dataset(s) from 'train' split
-Number of training examples: 8530
-```
-
-
-
-```
-:ok
-```
-
-#### Load a specific dataset configuration
-
-For datasets with multiple configurations (like GLUE), use the `subdir` option:
-
-```elixir
-{:ok, sst2_data} = ElixirDatasets.load_dataset(
- {:hf, "nyu-mll/glue", subdir: "sst2"}
-)
-
-IO.puts("Loaded #{length(sst2_data)} dataset(s) from 'sst2' configuration")
-```
-
-
-
-```
-Loaded 3 dataset(s) from 'sst2' configuration
-```
-
-
-
-```
-:ok
-```
-
-#### Combine split and subdir parameters
-
-Load a specific split from a specific configuration:
-
-```elixir
-{:ok, sst2_train} = ElixirDatasets.load_dataset(
- {:hf, "nyu-mll/glue", subdir: "sst2"},
- split: "train"
-)
-
-IO.puts("Loaded #{length(sst2_train)} dataset(s) from 'sst2' configuration, 'train' split")
-```
-
-
-
-```
-Loaded 1 dataset(s) from 'sst2' configuration, 'train' split
-```
-
-
-
-```
-:ok
-```
-
-#### Streaming mode for large datasets
-
-When `streaming: true`, you get a Stream that yields rows on-demand:
-
-```elixir
-{:ok, stream} = ElixirDatasets.load_dataset(
- {:local, "#{__DIR__}/../resources"},
- streaming: true
-)
-
-IO.puts("β Created stream (no data loaded yet!)")
-IO.puts(" Stream type: #{inspect(is_function(stream, 2))}")
-
-IO.puts("\nFetching first 5 rows progressively...")
-rows = stream
-|> Stream.take(5)
-|> Enum.to_list()
-
-IO.puts("β Fetched #{length(rows)} rows")
-rows |> Enum.with_index(1) |> Enum.each(fn {row, idx} ->
- keys = Map.keys(row) |> Enum.join(", ")
- IO.puts(" Row #{idx}: [#{keys}]")
-end)
-```
-
-
-
-```
-β Created stream (no data loaded yet!)
- Stream type: true
-
-Fetching first 5 rows progressively...
-β Fetched 5 rows
- Row 1: [id, number]
- Row 2: [id, number]
- Row 3: [id, number]
- Row 4: [id, number]
- Row 5: [id, number]
-```
-
-
-
-```
-:ok
-```
-
-You can also control batch size and use Stream operations:
-
-```elixir
-{:ok, stream} = ElixirDatasets.load_dataset(
- {:local, "#{__DIR__}/../resources"},
- streaming: true,
- batch_size: 2
-)
-
-result = stream
-|> Stream.filter(fn row -> Map.has_key?(row, "id") end)
-|> Stream.map(fn row -> "ID: #{row["id"]}" end)
-|> Stream.take(3)
-|> Enum.to_list()
-
-IO.puts("Filtered and mapped results:")
-result |> Enum.each(&IO.puts(" #{&1}"))
-```
-
-
-
-```
-Filtered and mapped results:
- ID: 0
- ID: 1
- ID: 2
-```
-
-
-
-```
-:ok
-```
-
-**Streaming from HuggingFace:**
-
-```elixir
-{:ok, hf_stream} = ElixirDatasets.load_dataset(
- {:hf, "aaaaa32r/elixirDatasets"},
- streaming: true,
- batch_size: 5
-)
-
-IO.puts("Streaming from HuggingFace...")
-sample = hf_stream |> Enum.take(3)
-IO.puts("β Fetched #{length(sample)} rows from HuggingFace")
-```
-
-
-
-```
-Streaming from HuggingFace...
-β Fetched 3 rows from HuggingFace
-```
-
-
-
-```
-:ok
-```
-
-#### Parallel processing with num_proc
-
-Use `num_proc` to load multiple files in parallel:
-
-```elixir
-IO.puts("Loading with num_proc: 1 (sequential)...")
-{time_seq, {:ok, datasets_seq}} = :timer.tc(fn ->
- ElixirDatasets.load_dataset(
- {:hf, "aaaaa32r/elixirDatasets"},
- num_proc: 1
- )
-end)
-
-IO.puts("Loading with num_proc: 4 (parallel)...")
-{time_par, {:ok, datasets_par}} = :timer.tc(fn ->
- ElixirDatasets.load_dataset(
- {:hf, "aaaaa32r/elixirDatasets"},
- num_proc: 4
- )
-end)
-
-time_seq_sec = time_seq / 1_000_000
-time_par_sec = time_par / 1_000_000
-speedup = time_seq / time_par
-
-IO.puts(" Performance Comparison:")
-IO.puts(" Sequential (num_proc: 1): #{Float.round(time_seq_sec, 3)}s")
-IO.puts(" Parallel (num_proc: 4): #{Float.round(time_par_sec, 3)}s")
-IO.puts(" Speedup: #{Float.round(speedup, 2)}x")
-IO.puts(" Datasets loaded: #{length(datasets_par)}")
-```
-
-
-
-```
-Loading with num_proc: 1 (sequential)...
-Loading with num_proc: 4 (parallel)...
- Performance Comparison:
- Sequential (num_proc: 1): 0.282s
- Parallel (num_proc: 4): 0.284s
- Speedup: 0.99x
- Datasets loaded: 4
-```
-
-
-
-```
-:ok
-```
-
-#### Filter datasets by name pattern
-
-The `name` parameter filters files by matching the name in the file path:
-
-```elixir
-# Load only files containing "csv" in their filename
-{:ok, csv_only} = ElixirDatasets.load_dataset(
- {:local, "#{__DIR__}/../resources"},
- name: "csv"
-)
-
-IO.puts("Loaded #{length(csv_only)} dataset(s) matching 'csv'")
-[csv_df] = csv_only
-IO.inspect(csv_df)
-```
-
-
-
-```
-Loaded 1 dataset(s) matching 'csv'
-#Explorer.DataFrame<
- Polars[11 x 2]
- id s64 [0, 1, 2, 3, 4, ...]
- number string ["csv", "one", "two", "three", "four", ...]
->
-```
-
-
-
-```
-#Explorer.DataFrame<
- Polars[11 x 2]
- id s64 [0, 1, 2, 3, 4, ...]
- number string ["csv", "one", "two", "three", "four", ...]
->
-```
-
-
-
-```
-Dataset: aaaaa32r/elixirDatasets
-Config: csv
-Features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}]
-Training examples: 10
-```
-
-
-
-```
-:ok
-```
-
-#### Real-world use case - Training/Validation split
-
-A typical ML workflow loading separate train and validation sets:
-
-```elixir
-{:ok, [train_df]} = ElixirDatasets.load_dataset(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- split: "train"
-)
-
-{:ok, [val_df]} = ElixirDatasets.load_dataset(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- split: "validation"
-)
-
-IO.puts("Training examples: #{Explorer.DataFrame.n_rows(train_df)}")
-IO.puts("Validation examples: #{Explorer.DataFrame.n_rows(val_df)}")
-```
-
-
-
-```
-Training examples: 8530
-Validation examples: 1066
-```
-
-
-
-```
-:ok
-```
-
-#### Force redownload with download_mode
-
-Use `download_mode` to control caching behavior:
-
-```elixir
-{:ok, [fresh_data]} = ElixirDatasets.load_dataset(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- split: "train",
- download_mode: :force_redownload
-)
-
-IO.puts("Freshly downloaded dataset has #{Explorer.DataFrame.n_rows(fresh_data)} rows")
-```
-
-
-
-```
-Freshly downloaded dataset has 8530 rows
-```
-
-
-
-```
-:ok
-```
-
-Available `download_mode` options:
-
-* `:reuse_dataset_if_exists` (default) - Use cached data if available
-* `:force_redownload` - Always download fresh, even if cached
-
-#### Skip verification with verification_mode
-
-Use `verification_mode` to control validation checks:
-
-```elixir
-{:ok, [quick_data]} = ElixirDatasets.load_dataset(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- split: "validation",
- verification_mode: :no_checks
-)
-
-IO.puts("Loaded #{Explorer.DataFrame.n_rows(quick_data)} rows (skipping verification)")
-```
-
-
-
-```
-Loaded 1066 rows (skipping verification)
-```
-
-
-
-```
-:ok
-```
-
-Available `verification_mode` options:
-
-* `:basic_checks` (default) - Basic validation including file existence
-* `:no_checks` - Skip all validation for faster loading
-
-#### Combining multiple advanced options
-
-Combine data loading options with Hub options for maximum control:
-
-```elixir
-{:ok, file_paths} = ElixirDatasets.load_dataset(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- split: "test",
- streaming: true,
- download_mode: :force_redownload,
- verification_mode: :no_checks
-)
-
-IO.puts("Got #{length(file_paths)} file path(s) in streaming mode")
-Enum.each(file_paths, fn {path, ext} ->
- IO.puts(" - #{Path.basename(path)} (#{ext})")
-end)
-```
-
-
-
-```
-** (ArgumentError) errors were found at the given arguments:
-
- * 1st argument: not a list
-
- (erts 15.2.7) :erlang.length(#Function<53.117496853/2 in Stream.resource/3>)
- #cell:o36uvndfgwkvdreh:9: (file)
-```
-
-#### Using custom cache directory
-
-Control where downloaded files are stored:
-
-```elixir
-custom_cache = "/tmp/my_datasets_cache"
-
-{:ok, [cached_data]} = ElixirDatasets.load_dataset(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- split: "train",
- cache_dir: custom_cache
-)
-
-IO.puts("Dataset cached in: #{custom_cache}")
-IO.puts("Loaded #{Explorer.DataFrame.n_rows(cached_data)} rows")
-```
-
-
-
-```
-Dataset cached in: /tmp/my_datasets_cache
-Loaded 8530 rows
-```
-
-
-
-```
-:ok
-```
-
-#### Offline mode
-
-Work with cached datasets without network access:
-
-```elixir
-case ElixirDatasets.load_dataset(
- {:hf, "cornell-movie-review-data/rotten_tomatoes"},
- split: "train",
- offline: true
-) do
- {:ok, [offline_data]} ->
- IO.puts("β Loaded from cache: #{Explorer.DataFrame.n_rows(offline_data)} rows")
-
- {:error, reason} ->
- IO.puts("β Not in cache: #{reason}")
-end
-```
-
-
-
-```
-β Loaded from cache: 8530 rows
-```
-
-
-
-```
-:ok
-```
-
-## Upload dataset
-
-### Prepare datasets to upload
-
-```elixir
-[ df_head | df_tail ] = ElixirDatasets.load_dataset!({:local, "#{__DIR__}/../resources"})
-nil
-```
-
-
-
-```
-nil
-```
-
-### Upload dataset to huggingface hub
-
-```elixir
-# Commented out to avoid cluttering the repository
-# ElixirDatasets.upload_dataset(
-# df_head,
-# "aaaaa32r/elixirDatasets",
-# [file_extension: "csv"])
-```
-
-
-
-```
-nil
-```
-
-### Delete dataset file from huggingface hub
-
-```elixir
-# Commented out to avoid cluttering the repository
-# ElixirDatasets.Utils.Uploader.delete_file_from_dataset(
-# "aaaaa32r/elixirDatasets",
-# "briefly-576460442698708888-7FDZDhwtp6dOsH5dAT")
-```
-
-
-
-```
-nil
-```
-
-### Upload dataset to huggingface hub via lfs
-
-```elixir
-# Commented out to avoid cluttering the repository
-# ElixirDatasets.Utils.Uploader.upload_file_via_lfs(
-# "/Users/radoslawrolka/Downloads/companies-2023-q4-sm.csv.zip",
-# "aaaaa32r/elixirDatasets")
-```
-
-
-
-```
-nil
-```
-
-## Other loading methods
-
-### Get dataset infos
-
-```elixir
-ElixirDatasets.get_dataset_infos("cornell-movie-review-data/rotten_tomatoes")
-```
-
-
-
-```
-{:ok,
- [
- %ElixirDatasets.DatasetInfo{
- config_name: nil,
- features: [
- %{"dtype" => "string", "name" => "text"},
- %{
- "dtype" => %{"class_label" => %{"names" => %{"0" => "neg", "1" => "pos"}}},
- "name" => "label"
- }
- ],
- splits: [
- %{"name" => "train", "num_bytes" => 1074810, "num_examples" => 8530},
- %{"name" => "validation", "num_bytes" => 134679, "num_examples" => 1066},
- %{"name" => "test", "num_bytes" => 135972, "num_examples" => 1066}
- ],
- description: nil,
- homepage: nil,
- license: nil,
- citation: nil
- }
- ]}
-```
-
-### Get dataset split names
-
-```elixir
-ElixirDatasets.get_dataset_split_names("cornell-movie-review-data/rotten_tomatoes")
-```
-
-
-
-```
-{:ok, ["train", "validation", "test"]}
-```
-
-### Get dataset config names
-
-```elixir
-ElixirDatasets.get_dataset_config_names("aaaaa32r/elixirDatasets")
-```
-
-
-
-```
-{:ok, ["csv", "default"]}
-```
-
-### Write-to-file & read-from-file datasetInfo
-
-```elixir
-{:ok, dataset_info} = ElixirDatasets.get_dataset_infos("aaaaa32r/elixirDatasets")
-ElixirDatasets.DatasetInfo.write_to_directory(dataset_info, "my-dir")
-ElixirDatasets.DatasetInfo.from_directory("my-dir")
-```
-
-
-
-```
-{:ok,
- [
- %ElixirDatasets.DatasetInfo{
- config_name: "csv",
- features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}],
- splits: [%{"name" => "train", "num_bytes" => 160, "num_examples" => 10}],
- description: nil,
- homepage: nil,
- license: nil,
- citation: nil
- },
- %ElixirDatasets.DatasetInfo{
- config_name: "default",
- features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}],
- splits: [%{"name" => "train", "num_bytes" => 160, "num_examples" => 10}],
- description: nil,
- homepage: nil,
- license: nil,
- citation: nil
- }
- ]}
-```
diff --git a/examples/integration_examples.livemd b/examples/integration_examples.livemd
new file mode 100644
index 0000000..8eb98e8
--- /dev/null
+++ b/examples/integration_examples.livemd
@@ -0,0 +1,213 @@
+# Integration Examples
+
+```elixir
+Mix.install([
+ {:elixir_datasets, "~> 0.1.0"},
+ {:nx, "~> 0.7"},
+ {:axon, "~> 0.6"},
+ {:bumblebee, "~> 0.5"},
+ {:kino, "~> 0.12"}
+])
+```
+
+## Setup
+
+```elixir
+auth_token = System.get_env("HF_TOKEN")
+:ok
+```
+
+## Integration with Nx
+
+### Convert DataFrame to Nx Tensors
+
+Load a dataset and convert it to Nx tensors for numerical computing:
+
+```elixir
+{:ok, [train_df]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train"
+)
+
+labels =
+ train_df
+ |> Explorer.DataFrame.pull("label")
+ |> Explorer.Series.to_list()
+ |> Nx.tensor()
+
+IO.puts("Labels tensor shape: #{inspect(Nx.shape(labels))}")
+IO.puts("Labels tensor type: #{inspect(Nx.type(labels))}")
+IO.inspect(labels[0..9], label: "First 10 labels")
+```
+
+### Prepare Data for Training
+
+```elixir
+{:ok, [train_df]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train"
+)
+
+{:ok, [val_df]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "validation"
+)
+
+train_labels = train_df |> Explorer.DataFrame.pull("label") |> Explorer.Series.to_list() |> Nx.tensor()
+val_labels = val_df |> Explorer.DataFrame.pull("label") |> Explorer.Series.to_list() |> Nx.tensor()
+
+IO.puts("Training samples: #{Nx.size(train_labels)}")
+IO.puts("Validation samples: #{Nx.size(val_labels)}")
+
+positive_count = train_labels |> Nx.sum() |> Nx.to_number()
+total_count = Nx.size(train_labels)
+IO.puts("Positive class ratio: #{Float.round(positive_count / total_count, 3)}")
+```
+
+## Integration with Bumblebee
+
+### Fill-Mask with DistilBERT (Quick Demo)
+
+Load a pre-trained DistilBERT model and tokenizer with Bumblebee and build a fill-mask serving:
+
+```elixir
+{:ok, model_info} = Bumblebee.load_model({:hf, "distilbert/distilbert-base-uncased"})
+{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "distilbert/distilbert-base-uncased"})
+
+serving = Bumblebee.Text.fill_mask(model_info, tokenizer)
+
+IO.puts("✓ Model loaded successfully!")
+```
+
+Use the model with data from ElixirDatasets:
+
+```elixir
+{:ok, [test_df]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "validation"
+)
+
+sample_text =
+ test_df
+ |> Explorer.DataFrame.slice(0, 1)
+ |> Explorer.DataFrame.pull("text")
+ |> Explorer.Series.first()
+
+IO.puts("\n=== Dataset Sample ===")
+IO.puts("Dataset: rotten_tomatoes")
+IO.puts("Sample: #{String.slice(sample_text, 0, 80)}...")
+
+masked_text = "This movie is [MASK]."
+
+IO.puts("\nRunning inference (first run compiles ~1-2 min)...")
+
+result = Nx.Serving.run(serving, masked_text)
+top = result.predictions |> List.first()
+
+IO.puts("\n=== Fill-Mask Result ===")
+IO.puts("Input: #{masked_text}")
+IO.puts("Predicted: '#{top.token}' (score: #{Float.round(top.score, 3)})")
+```
+
+## Integration with Axon
+
+### Build a Simple Neural Network
+
+Create a text classification model using Axon with data from ElixirDatasets:
+
+```elixir
+{:ok, [train_df]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train"
+)
+
+model =
+ Axon.input("input", shape: {nil, 100})
+ |> Axon.dense(64, activation: :relu)
+ |> Axon.dropout(rate: 0.5)
+ |> Axon.dense(32, activation: :relu)
+ |> Axon.dense(2, activation: :softmax)
+
+Axon.Display.as_graph(model, Nx.template({1, 100}, :f32))
+```
+
+### Streaming Data for Training
+
+Use streaming to efficiently process large datasets:
+
+```elixir
+{:ok, stream} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train",
+ streaming: true
+)
+
+batch_size = 32
+
+batched_stream =
+ stream
+ |> Stream.chunk_every(batch_size)
+ |> Stream.take(5)
+
+IO.puts("Processing batches of #{batch_size} samples:\n")
+for {batch, idx} <- Enum.with_index(batched_stream, 1) do
+ labels = batch |> Enum.map(& &1["label"]) |> Nx.tensor()
+ IO.puts("Batch #{idx}: #{length(batch)} samples, labels shape: #{inspect(Nx.shape(labels))}")
+end
+```
+
+## Advanced: Custom Data Pipeline
+
+### Combine ElixirDatasets with Nx and Axon for End-to-End Training
+
+```elixir
+defmodule DataPipeline do
+ @doc """
+ Creates a data pipeline that loads, preprocesses, and batches data
+ """
+ def create_pipeline(dataset_name, split, batch_size) do
+ {:ok, stream} = ElixirDatasets.load_dataset(
+ {:hf, dataset_name},
+ split: split,
+ streaming: true
+ )
+
+ stream
+ |> Stream.chunk_every(batch_size)
+ |> Stream.map(&prepare_batch/1)
+ end
+
+ defp prepare_batch(batch) do
+ labels =
+ batch
+ |> Enum.map(& &1["label"])
+ |> Nx.tensor()
+
+ texts = Enum.map(batch, & &1["text"])
+
+ {texts, labels}
+ end
+end
+
+pipeline = DataPipeline.create_pipeline(
+ "cornell-movie-review-data/rotten_tomatoes",
+ "train",
+ 16
+)
+
+{texts, labels} = Enum.at(pipeline, 0)
+IO.puts("Batch size: #{length(texts)}")
+IO.puts("Labels shape: #{inspect(Nx.shape(labels))}")
+IO.puts("Sample text: #{List.first(texts) |> String.slice(0..100)}...")
+```
+
+## Summary
+
+This notebook demonstrates how to integrate ElixirDatasets with:
+
+* **Nx**: Convert DataFrames to tensors for numerical computing
+* **Bumblebee**: Use pre-trained models with loaded datasets
+* **Axon**: Define neural networks and batch dataset streams for training
+* **Custom Pipelines**: Create efficient data processing workflows
+
+These integrations enable you to build complete machine learning pipelines in Elixir!
diff --git a/examples/usage_examples.livemd b/examples/usage_examples.livemd
new file mode 100644
index 0000000..adeacd2
--- /dev/null
+++ b/examples/usage_examples.livemd
@@ -0,0 +1,881 @@
+
+
+# Usage examples
+
+```elixir
+# Install dependencies (pinned to version 0.1.0)
+Mix.install([
+  {:elixir_datasets, "0.1.0"}
+  # {:elixir_datasets, path: "#{__DIR__}/.."} # Local dev-testing version (use instead of the entry above)
+])
+
+# Read the Hugging Face token from HF_TOKEN (`nil` when unset); passed as `auth_token:` in a later cell
+auth_token = System.get_env("HF_TOKEN")
+:ok
+```
+
+## Import library
+
+```elixir
+import ElixirDatasets
+```
+
+
+
+```
+ElixirDatasets
+```
+
+## Load dataset
+
+This section demonstrates all the ways to load datasets using `ElixirDatasets.load_dataset/2`.
+
+### Basic Loading
+
+#### Load dataset from Hugging Face
+
+Load a dataset directly from the Hugging Face Hub.
+
+```elixir
+{:ok, datasets} = ElixirDatasets.load_dataset({:hf, "fka/awesome-chatgpt-prompts"})
+IO.puts("Loaded #{length(datasets)} dataset(s)")
+datasets
+```
+
+
+
+```
+|===============================================================| 100% (1.51 MB)
+Loaded 1 dataset(s)
+```
+
+
+
+```
+[
+ #Explorer.DataFrame<
+ Polars[983 x 5]
+ act string ["Ethereum Developer", "Linux Terminal", "English Translator and Improver",
+ "Job Interviewer", "JavaScript Console", ...]
+ prompt string ["Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.",
+ "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is pwd",
+ "I want you to act as an English translator, spelling corrector and improver. I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in English. I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level English words and sentences. Keep the meaning same, but make them more literary. I want you to only reply the correction, the improvements and nothing else, do not write explanations. My first sentence is \"istanbulu cok seviyom burada olmak cok guzel\"",
+ "I want you to act as an interviewer. I will be the candidate and you will ask me the interview questions for the ${Position:Software Developer} position. I want you to only reply as the interviewer. Do not write all the conversation at once. I want you to only do the interview with me. Ask me the questions and wait for my answers. Do not write explanations. Ask me the questions one by one like an interviewer does and wait for my answers.\n\nMy first sentence is \"Hi\"",
+ "I want you to act as a javascript console. I will type commands and you will reply with what the javascript console should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is console.log(\"Hello World\");",
+ ...]
+ for_devs boolean [true, true, false, false, true, ...]
+ type string ["TEXT", "TEXT", "TEXT", "TEXT", "TEXT", ...]
+ contributor string ["ameya-2003", "f", "f", "f,iltekin", "omerimzali", ...]
+ >
+]
+```
+
+#### Load dataset from a Hugging Face subdirectory
+
+Some datasets have multiple configurations in subdirectories.
+
+```elixir
+{:ok, datasets} = ElixirDatasets.load_dataset(
+ {:hf, "stanfordnlp/imdb", subdir: "plain_text"}
+)
+IO.puts("Loaded #{length(datasets)} dataset(s) from 'plain_text' configuration")
+datasets
+```
+
+
+
+```
+|==============================================================| 100% (20.47 MB)
+|==============================================================| 100% (20.97 MB)
+|==============================================================| 100% (41.99 MB)
+Loaded 3 dataset(s) from 'plain_text' configuration
+```
+
+
+
+```
+[
+ #Explorer.DataFrame<
+ Polars[25000 x 2]
+ text string ["I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichΓ©d and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.",
+ "Worth the entertainment value of a rental, especially if you like action movies. This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before.
The plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. Many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. All passably acted but again nothing special.
I thought the main villains were pretty well done and fairly well acted. By the end of the movie you certainly knew who the good guys were and weren't. There was an emotional lift as the really bad ones got their just deserts. Very simplistic, but then you weren't expecting Hamlet, right? The only thing I found really annoying was the constant cuts to VDs daughter during the last fight scene.
Not bad. Not good. Passable 4.",
+ "its a totally average film with a few semi-alright action sequences that make the plot seem a little better and remind the viewer of the classic van dam films. parts of the plot don't make sense and seem to be added in to use up time. the end plot is that of a very basic type that doesn't leave the viewer guessing and any twists are obvious from the beginning. the end scene with the flask backs don't make sense as they are added in and seem to have little relevance to the history of van dam's character. not really worth watching again, bit disappointed in the end production, even though it is apparent it was shot on a low budget certain shots and sections in the film are of poor directed quality",
+ "STAR RATING: ***** Saturday Night **** Friday Night *** Friday Morning ** Sunday Night * Monday Morning
Former New Orleans homicide cop Jack Robideaux (Jean Claude Van Damme) is re-assigned to Columbus, a small but violent town in Mexico to help the police there with their efforts to stop a major heroin smuggling operation into their town. The culprits turn out to be ex-military, lead by former commander Benjamin Meyers (Stephen Lord, otherwise known as Jase from East Enders) who is using a special method he learned in Afghanistan to fight off his opponents. But Jack has a more personal reason for taking him down, that draws the two men into an explosive final showdown where only one will walk away alive.
After Until Death, Van Damme appeared to be on a high, showing he could make the best straight to video films in the action market. While that was a far more drama oriented film, with The Shepherd he has returned to the high-kicking, no brainer action that first made him famous and has sadly produced his worst film since Derailed. It's nowhere near as bad as that film, but what I said still stands.
A dull, predictable film, with very little in the way of any exciting action. What little there is mainly consists of some limp fight scenes, trying to look cool and trendy with some cheap slo-mo/sped up effects added to them that sadly instead make them look more desperate. Being a Mexican set film, director Isaac Florentine has tried to give the film a Robert Rodriguez/Desperado sort of feel, but this only adds to the desperation.
VD gives a particularly uninspired performance and given he's never been a Robert De Niro sort of actor, that can't be good. As the villain, Lord shouldn't expect to leave the beeb anytime soon. He gets little dialogue at the beginning as he struggles to muster an American accent but gets mysteriously better towards the end. All the supporting cast are equally bland, and do nothing to raise the films spirits at all.
This is one shepherd that's strayed right from the flock. *",
+ "First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!",
+ ...]
+ label s64 [0, 0, 0, 0, 0, ...]
+ >,
+ #Explorer.DataFrame<
+ Polars[25000 x 2]
+ text string ["I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered \"controversial\" I really had to see this for myself.
The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.
What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. While my countrymen mind find it shocking, in reality sex and nudity are a major staple in Swedish cinema. Even Ingmar Bergman, arguably their answer to good old boy John Ford, had sex scenes in his films.
I do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in America. I AM CURIOUS-YELLOW is a good film for anyone wanting to study the meat and potatoes (no pun intended) of Swedish cinema. But really, this film doesn't have much of a plot.",
+ "\"I Am Curious: Yellow\" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don't exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) \"double-standard\" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the same cannot be said for a man. In fact, you generally won't see female genitals in an American film in anything short of porn or explicit erotica. This alleged double-standard is less a double standard than an admittedly depressing ability to come to terms culturally with the insides of women's bodies.",
+ "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.
One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).
One might better spend one's time staring out a window at a tree growing.
",
+      "This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.
The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.
A movie of its time, and place. 2/10.",
+ "Oh, brother...after hearing about this ridiculous film for umpteen years all I can think of is that old Peggy Lee song..
\"Is that all there is??\" ...I was just an early teen when this smoked fish hit the U.S. I was too young to get in the theater (although I did manage to sneak into \"Goodbye Columbus\"). Then a screening at a local film museum beckoned - Finally I could see this film, except now I was as old as my parents were when they schlepped to see it!!
The ONLY reason this film was not condemned to the anonymous sands of time was because of the obscenity case sparked by its U.S. release. MILLIONS of people flocked to this stinker, thinking they were going to see a sex film...Instead, they got lots of closeups of gnarly, repulsive Swedes, on-street interviews in bland shopping malls, asinie political pretension...and feeble who-cares simulated sex scenes with saggy, pale actors.
Cultural icon, holy grail, historic artifact..whatever this thing was, shred it, burn it, then stuff the ashes in a lead box!
Elite esthetes still scrape to find value in its boring pseudo revolutionary political spewings..But if it weren't for the censorship scandal, it would have been ignored, then forgotten.
Instead, the \"I Am Blank, Blank\" rhythymed title was repeated endlessly for years as a titilation for porno films (I am Curious, Lavender - for gay films, I Am Curious, Black - for blaxploitation films, etc..) and every ten years or so the thing rises from the dead, to be viewed by a new generation of suckers who want to see that \"naughty sex film\" that \"revolutionized the film industry\"...
Yeesh, avoid like the plague..Or if you MUST see it - rent the video and fast forward to the \"dirty\" parts, just to get it over with.
",
+ ...]
+ label s64 [0, 0, 0, 0, 0, ...]
+ >,
+ #Explorer.DataFrame<
+ Polars[50000 x 2]
+ text string ["This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie \"Leon\" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please \"Frankie Starlight\", she's speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the \"Point of no return\" and especially the \"La femme Nikita\" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which \"translate\" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you'll regret later :)",
+ "When I say this is my favourite film of all time, that comment is not to be taken lightly. I probably watch far too many films than is healthy for me, and have loved quite a few of them. I first saw \"La Femme Nikita\" nearly ten years ago, and it still manages to be my absolute favourite. Why?
This is more than an incredibly stylish and sexy thriller. Luc Besson's great flair for impeccable direction, fashion, and appropriate usage of music makes this a very watchable film. But it is Anne Parillaud's perfect rendering of a complex character who transforms from a heartless killer into a compassionate, vibrant young woman that makes this film beautiful. I can't keep my eyes off of her when she is on screen.
I have seen several of Luc Besson's films including \"Subway\", \"The Professional\", and the irritating \"Fifth Element\", and \"Nikita\" is without a doubt, far superior to any of these. Although this film has tragic elements, it is ultimately extremely hopeful. It is the story of a person who is cruel and merciless, who ultimately comes to realize her own humanity and her own personal power. That, to me is extremely inspiring. If there is hope for Nikita, there is hope for all of us.",
+ "I saw this movie because I am a huge fan of the TV series of the same name starring Roy Dupuis and Pet Wilson. The movie was really good and I saw how the TV show is based on the movie. A few episodes of the TV series came directly from the movie and their similarity was amazing. To keep things short, any fan of the movie has to watch the series and any fan of the series must see the original Nikita.",
+ "Being that the only foreign films I usually like star a Japanese person in a rubber suit who crushes little tiny buildings and tanks, I had high hopes for this movie. I thought that this was a movie that wouldn't put me to sleep. WRONG! Starts off with a bang, okay, now she's in training, alright, she's an assassin, I'm still with you, oh, now she's having this moral dilemma and she can't decide if she loves her boyfriend or her controller, zzzzz.... Oh well, back to Gamera!",
+ "After seeing Point of No Return (a great movie) and being told that the original was better, I was certainly thrilled to see that one of the indie film channels was running La Femme Nikita. Then I saw the movie. Ouch! This was a major let-down.
Nikita herself reminds me of Jar Jar Binks more than any other character I've seen recently. She comes across entirely as comic relief. The movie simply has nothing to recommend it besides the core concept of an evil, inhuman character paradoxically learning to be human while training as an assassin, and that concept failed miserably in Nikita due to the poor writing of the title role.",
+ ...]
+ label s64 [-1, -1, -1, -1, -1, ...]
+ >
+]
+```
+
+### Load dataset from Hugging Face with an auth token
+
+Use an authentication token to access private datasets.
+
+```elixir
+datasets = ElixirDatasets.load_dataset!(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ auth_token: auth_token
+)
+IO.puts("Loaded #{length(datasets)} dataset(s) with authentication")
+datasets
+```
+
+
+
+```
+|==============================================================| 100% (92.20 KB)
+|=============================================================| 100% (698.84 KB)
+|==============================================================| 100% (90.00 KB)
+Loaded 3 dataset(s) with authentication
+```
+
+
+
+```
+[
+ #Explorer.DataFrame<
+ Polars[1066 x 2]
+ text string ["lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .",
+ "consistently clever and suspenseful .",
+ "it's like a \" big chill \" reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .",
+ "the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with tremendous skill .",
+ "red dragon \" never cuts corners .", ...]
+ label s64 [1, 1, 1, 1, 1, ...]
+ >,
+ #Explorer.DataFrame<
+ Polars[8530 x 2]
+ text string ["the rock is destined to be the 21st century's new \" conan \" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .",
+ "the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .",
+ "effective but too-tepid biopic",
+ "if you sometimes like to go to the movies to have fun , wasabi is a good place to start .",
+ "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .",
+ ...]
+ label s64 [1, 1, 1, 1, 1, ...]
+ >,
+ #Explorer.DataFrame<
+ Polars[1066 x 2]
+ text string ["compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .",
+ "the soundtrack alone is worth the price of admission .",
+ "rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .",
+ "beneath the film's obvious determination to shock at any cost lies considerable skill and determination , backed by sheer nerve .",
+ "bielinsky is a filmmaker of impressive talent .", ...]
+ label s64 [1, 1, 1, 1, 1, ...]
+ >
+]
+```
+
+### Load dataset from local resources
+
+Load datasets from local files (CSV, Parquet, or JSONL).
+
+```elixir
+{:ok, datasets} = ElixirDatasets.load_dataset({:local, "#{__DIR__}/../resources"})
+IO.puts("Loaded #{length(datasets)} dataset(s) from local directory")
+datasets
+```
+
+
+
+```
+Loaded 3 dataset(s) from local directory
+```
+
+
+
+```
+[
+ #Explorer.DataFrame<
+ Polars[11 x 2]
+ id s64 [0, 1, 2, 3, 4, ...]
+ number string ["csv", "one", "two", "three", "four", ...]
+ >,
+ #Explorer.DataFrame<
+ Polars[11 x 2]
+ id s64 [0, 1, 2, 3, 4, ...]
+ number string ["jsonl", "one", "two", "three", "four", ...]
+ >,
+ #Explorer.DataFrame<
+ Polars[11 x 2]
+ id s64 [0, 1, 2, 3, 4, ...]
+ number string ["parquet", "one", "two", "three", "four", ...]
+ >
+]
+```
+
+### Advanced Loading Options
+
+The `load_dataset` function supports several parameters for flexible data loading:
+
+* **`split`**: Load only a specific split (e.g., "train", "validation", "test")
+* **`name`**: Filter files by matching a pattern in the filename/path
+* **`streaming`**: Return a lazy `Stream` that yields rows on demand instead of loading all data into memory
+* **`download_mode`**: Control caching behavior (`:reuse_dataset_if_exists`, `:force_redownload`)
+* **`verification_mode`**: Control validation checks (`:basic_checks`, `:no_checks`)
+* **`num_proc`**: Number of parallel processes for faster loading
+* **`cache_dir`**: Custom cache directory location
+* **`offline`**: Only use cached files, no network requests
+
+**Note**: For datasets with subdirectories (like GLUE), use the existing `subdir` option in the repository tuple: `{:hf, "dataset-name", subdir: "config-name"}`. The `name` parameter is for filtering files within a directory by matching patterns in filenames.
+
+#### Load only a specific split
+
+Load only the training split from a dataset:
+
+```elixir
+{:ok, train_data} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train"
+)
+
+IO.puts("Loaded #{length(train_data)} dataset(s) from 'train' split")
+[train_df] = train_data
+IO.puts("Number of training examples: #{Explorer.DataFrame.n_rows(train_df)}")
+```
+
+
+
+```
+Loaded 1 dataset(s) from 'train' split
+Number of training examples: 8530
+```
+
+
+
+```
+:ok
+```
+
+#### Load a specific dataset configuration
+
+For datasets with multiple configurations (like GLUE), use the `subdir` option:
+
+```elixir
+{:ok, sst2_data} = ElixirDatasets.load_dataset(
+ {:hf, "nyu-mll/glue", subdir: "sst2"}
+)
+
+IO.puts("Loaded #{length(sst2_data)} dataset(s) from 'sst2' configuration")
+```
+
+
+
+```
+|=============================================================| 100% (147.79 KB)
+|===============================================================| 100% (3.11 MB)
+|==============================================================| 100% (72.81 KB)
+Loaded 3 dataset(s) from 'sst2' configuration
+```
+
+
+
+```
+:ok
+```
+
+#### Combine split and subdir parameters
+
+Load a specific split from a specific configuration:
+
+```elixir
+{:ok, sst2_train} = ElixirDatasets.load_dataset(
+ {:hf, "nyu-mll/glue", subdir: "sst2"},
+ split: "train"
+)
+
+IO.puts("Loaded #{length(sst2_train)} dataset(s) from 'sst2' configuration, 'train' split")
+```
+
+
+
+```
+Loaded 1 dataset(s) from 'sst2' configuration, 'train' split
+```
+
+
+
+```
+:ok
+```
+
+#### Streaming mode for large datasets
+
+When `streaming: true`, you get a Stream that yields rows on-demand:
+
+```elixir
+{:ok, stream} = ElixirDatasets.load_dataset(
+ {:local, "#{__DIR__}/../resources"},
+ streaming: true
+)
+
+IO.puts("β Created stream (no data loaded yet!)")
+IO.puts(" Stream type: #{inspect(is_function(stream, 2))}")
+
+IO.puts("\nFetching first 5 rows progressively...")
+rows = stream
+|> Stream.take(5)
+|> Enum.to_list()
+
+IO.puts("β Fetched #{length(rows)} rows")
+rows |> Enum.with_index(1) |> Enum.each(fn {row, idx} ->
+ keys = Map.keys(row) |> Enum.join(", ")
+ IO.puts(" Row #{idx}: [#{keys}]")
+end)
+```
+
+
+
+```
+β Created stream (no data loaded yet!)
+ Stream type: true
+
+Fetching first 5 rows progressively...
+β Fetched 5 rows
+ Row 1: [id, number]
+ Row 2: [id, number]
+ Row 3: [id, number]
+ Row 4: [id, number]
+ Row 5: [id, number]
+```
+
+
+
+```
+:ok
+```
+
+You can also control batch size and use Stream operations:
+
+```elixir
+{:ok, stream} =
+  ElixirDatasets.load_dataset({:local, "#{__DIR__}/../resources"}, streaming: true, batch_size: 2)
+
+# Keep only rows that carry an "id" key, then render each one as a label string.
+with_id? = &Map.has_key?(&1, "id")
+
+result =
+  stream
+  |> Stream.filter(with_id?)
+  |> Stream.map(&"ID: #{&1["id"]}")
+  |> Enum.take(3)
+
+IO.puts("Filtered and mapped results:")
+Enum.each(result, fn line -> IO.puts("  #{line}") end)
+```
+
+
+
+```
+Filtered and mapped results:
+ ID: 0
+ ID: 1
+ ID: 2
+```
+
+
+
+```
+:ok
+```
+
+**Streaming from HuggingFace:**
+
+```elixir
+{:ok, hf_stream} = ElixirDatasets.load_dataset(
+ {:hf, "aaaaa32r/elixirDatasets"},
+ streaming: true,
+ batch_size: 5
+)
+
+IO.puts("Streaming from HuggingFace...")
+sample = hf_stream |> Enum.take(3)
+IO.puts("β Fetched #{length(sample)} rows from HuggingFace")
+```
+
+
+
+```
+Streaming from HuggingFace...
+β Fetched 3 rows from HuggingFace
+```
+
+
+
+```
+:ok
+```
+
+#### Parallel processing with num_proc
+
+Use `num_proc` to load multiple files in parallel for faster processing. **This is most efficient for datasets with many files.**
+
+```elixir
+# GLUE lives under the namespaced "nyu-mll/glue" repo id (as in the `sst2` examples);
+# `num_proc: 4` loads the MRPC files with four parallel processes.
+{:ok, data} =
+  ElixirDatasets.load_dataset({:hf, "nyu-mll/glue", subdir: "mrpc"}, num_proc: 4)
+
+IO.puts("Loaded #{length(data)} splits")
+```
+
+
+
+```
+|=============================================================| 100% (308.44 KB)
+|=============================================================| 100% (649.28 KB)
+|==============================================================| 100% (75.67 KB)
+Loaded 3 splits
+```
+
+
+
+```
+:ok
+```
+
+#### Real-world use case - Training/Validation split
+
+A typical ML workflow loading separate train and validation sets:
+
+```elixir
+{:ok, [train_df]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train"
+)
+
+{:ok, [val_df]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "validation"
+)
+
+IO.puts("Training examples: #{Explorer.DataFrame.n_rows(train_df)}")
+IO.puts("Validation examples: #{Explorer.DataFrame.n_rows(val_df)}")
+```
+
+
+
+```
+Training examples: 8530
+Validation examples: 1066
+```
+
+
+
+```
+:ok
+```
+
+#### Force redownload with download_mode
+
+Force a fresh download even if the dataset is already cached.
+
+```elixir
+{:ok, [fresh_data]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train",
+ download_mode: :force_redownload
+)
+
+IO.puts("Freshly downloaded dataset has #{Explorer.DataFrame.n_rows(fresh_data)} rows")
+```
+
+
+
+```
+|=============================================================| 100% (698.84 KB)
+Freshly downloaded dataset has 8530 rows
+```
+
+
+
+```
+:ok
+```
+
+Available `download_mode` options:
+
+* `:reuse_dataset_if_exists` (default) - Use cached data if available
+* `:force_redownload` - Always download fresh, even if cached
+
+#### Skip verification with verification_mode
+
+Skip validation checks for faster loading when you trust the data source.
+
+```elixir
+{:ok, [quick_data]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "validation",
+ verification_mode: :no_checks
+)
+
+IO.puts("Loaded #{Explorer.DataFrame.n_rows(quick_data)} rows (skipping verification)")
+```
+
+
+
+```
+Loaded 1066 rows (skipping verification)
+```
+
+
+
+```
+:ok
+```
+
+Available `verification_mode` options:
+
+* `:basic_checks` (default) - Basic validation including file existence
+* `:no_checks` - Skip all validation for faster loading
+
+#### Combining multiple advanced options
+
+Combine data loading options with Hub options for maximum control:
+
+```elixir
+{:ok, stream} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "test",
+ streaming: true,
+ download_mode: :force_redownload,
+ verification_mode: :no_checks
+)
+
+IO.puts("β Created stream in streaming mode")
+IO.puts(" Stream type: #{inspect(is_function(stream, 2))}")
+
+IO.puts("\nFetching first 3 rows...")
+sample_rows = stream |> Enum.take(3)
+IO.puts("β Fetched #{length(sample_rows)} rows")
+sample_rows |> Enum.with_index(1) |> Enum.each(fn {row, idx} ->
+ keys = Map.keys(row) |> Enum.join(", ")
+ IO.puts(" Row #{idx}: [#{keys}]")
+end)
+```
+
+
+
+```
+β Created stream in streaming mode
+ Stream type: true
+
+Fetching first 3 rows...
+β Fetched 3 rows
+ Row 1: [label, text]
+ Row 2: [label, text]
+ Row 3: [label, text]
+```
+
+
+
+```
+:ok
+```
+
+#### Using custom cache directory
+
+Control where downloaded files are stored:
+
+```elixir
+custom_cache = "/tmp/my_datasets_cache"
+
+{:ok, [cached_data]} = ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train",
+ cache_dir: custom_cache
+)
+
+IO.puts("Dataset cached in: #{custom_cache}")
+IO.puts("Loaded #{Explorer.DataFrame.n_rows(cached_data)} rows")
+```
+
+
+
+```
+Dataset cached in: /tmp/my_datasets_cache
+Loaded 8530 rows
+```
+
+
+
+```
+:ok
+```
+
+#### Offline mode
+
+Work with cached datasets without network access:
+
+```elixir
+case ElixirDatasets.load_dataset(
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ split: "train",
+ offline: true
+) do
+ {:ok, [offline_data]} ->
+ IO.puts("✓ Loaded from cache: #{Explorer.DataFrame.n_rows(offline_data)} rows")
+
+ {:error, reason} ->
+ IO.puts("✗ Not in cache: #{reason}")
+end
+```
+
+
+
+```
+✓ Loaded from cache: 8530 rows
+```
+
+
+
+```
+:ok
+```
+
+## Upload dataset
+
+Upload your own datasets to Hugging Face Hub.
+
+### Prepare datasets to upload
+
+```elixir
+[df_head | _df_tail] = ElixirDatasets.load_dataset!({:local, "#{__DIR__}/../resources"})
+IO.puts("Prepared dataset with #{Explorer.DataFrame.n_rows(df_head)} rows for upload")
+:ok
+```
+
+
+
+```
+Prepared dataset with 11 rows for upload
+```
+
+
+
+```
+:ok
+```
+
+### Upload dataset to Hugging Face Hub
+
+Upload a DataFrame as a dataset file (CSV, Parquet, or JSONL).
+
+```elixir
+# Uncomment to upload (requires HF_TOKEN)
+# ElixirDatasets.upload_dataset(
+# df_head,
+# "username/dataset-name",
+# file_extension: "csv"
+# )
+IO.puts("Upload example (commented out)")
+```
+
+
+
+```
+Upload example (commented out)
+```
+
+
+
+```
+:ok
+```
+
+### Delete dataset file from Hugging Face Hub
+
+Remove a specific file from your dataset repository.
+
+```elixir
+# Uncomment to delete a file (requires HF_TOKEN)
+# ElixirDatasets.Utils.Uploader.delete_file_from_dataset(
+# "username/dataset-name",
+# "file-id-to-delete"
+# )
+IO.puts("Delete example (commented out)")
+```
+
+
+
+```
+Delete example (commented out)
+```
+
+
+
+```
+:ok
+```
+
+### Upload dataset to Hugging Face Hub via LFS
+
+Upload large files using Git LFS for better performance.
+
+```elixir
+# Uncomment to upload via LFS (requires HF_TOKEN and git-lfs)
+# ElixirDatasets.Utils.Uploader.upload_file_via_lfs(
+# "/path/to/large-file.csv.zip",
+# "username/dataset-name"
+# )
+IO.puts("LFS upload example (commented out)")
+```
+
+
+
+```
+LFS upload example (commented out)
+```
+
+
+
+```
+:ok
+```
+
+```elixir
+{:ok, infos} = ElixirDatasets.get_dataset_infos("cornell-movie-review-data/rotten_tomatoes")
+IO.puts("Retrieved #{length(infos)} dataset configuration(s)")
+infos
+```
+
+
+
+```
+Retrieved 1 dataset configuration(s)
+```
+
+
+
+```
+[
+ %ElixirDatasets.DatasetInfo{
+ config_name: nil,
+ features: [
+ %{"dtype" => "string", "name" => "text"},
+ %{
+ "dtype" => %{"class_label" => %{"names" => %{"0" => "neg", "1" => "pos"}}},
+ "name" => "label"
+ }
+ ],
+ splits: [
+ %{"name" => "train", "num_bytes" => 1074810, "num_examples" => 8530},
+ %{"name" => "validation", "num_bytes" => 134679, "num_examples" => 1066},
+ %{"name" => "test", "num_bytes" => 135972, "num_examples" => 1066}
+ ],
+ description: nil,
+ homepage: nil,
+ license: nil,
+ citation: nil
+ }
+]
+```
+
+### Get dataset split names
+
+Get available splits (train, validation, test) for a dataset.
+
+```elixir
+{:ok, splits} = ElixirDatasets.get_dataset_split_names("cornell-movie-review-data/rotten_tomatoes")
+IO.puts("Available splits: #{Enum.join(splits, ", ")}")
+splits
+```
+
+
+
+```
+Available splits: train, validation, test
+```
+
+
+
+```
+["train", "validation", "test"]
+```
+
+### Get dataset config names
+
+Get available configurations for datasets with multiple configs.
+
+```elixir
+{:ok, configs} = ElixirDatasets.get_dataset_config_names("aaaaa32r/elixirDatasets")
+IO.puts("Available configs: #{Enum.join(configs, ", ")}")
+configs
+```
+
+
+
+```
+Available configs: csv, default
+```
+
+
+
+```
+["csv", "default"]
+```
+
+### Write DatasetInfo to disk and read it back
+
+Save and load dataset metadata to/from disk for offline use.
+
+```elixir
+{:ok, dataset_info} = ElixirDatasets.get_dataset_infos("aaaaa32r/elixirDatasets")
+ElixirDatasets.DatasetInfo.write_to_directory(dataset_info, "my-dir")
+IO.puts("Saved dataset info to 'my-dir'")
+
+{:ok, loaded_info} = ElixirDatasets.DatasetInfo.from_directory("my-dir")
+IO.puts("Loaded #{length(loaded_info)} dataset info(s) from disk")
+loaded_info
+```
+
+
+
+```
+Saved dataset info to 'my-dir'
+Loaded 2 dataset info(s) from disk
+```
+
+
+
+```
+[
+ %ElixirDatasets.DatasetInfo{
+ config_name: "csv",
+ features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}],
+ splits: [%{"name" => "train", "num_bytes" => 160, "num_examples" => 10}],
+ description: nil,
+ homepage: nil,
+ license: nil,
+ citation: nil
+ },
+ %ElixirDatasets.DatasetInfo{
+ config_name: "default",
+ features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}],
+ splits: [%{"name" => "train", "num_bytes" => 160, "num_examples" => 10}],
+ description: nil,
+ homepage: nil,
+ license: nil,
+ citation: nil
+ }
+]
+```
diff --git a/lib/elixir_datasets.ex b/lib/elixir_datasets.ex
index 6314fe1..e673a33 100644
--- a/lib/elixir_datasets.ex
+++ b/lib/elixir_datasets.ex
@@ -4,12 +4,41 @@
defmodule ElixirDatasets do
@moduledoc """
- Todo: Add documentation for ElixirDatasets.
+ ElixirDatasets is a comprehensive library for accessing and managing datasets from Hugging Face Hub in Elixir.
+
+ This module provides the main public API for loading datasets, fetching metadata,
+ and uploading datasets to Hugging Face Hub.
+
+ ## Main Functions
+
+ * `load_dataset/2` - Load datasets from Hugging Face or local files
+ * `load_dataset!/2` - Same as `load_dataset/2` but raises on error
+ * `get_dataset_info/2` - Fetch dataset metadata
+ * `get_dataset_infos/2` - Fetch all dataset configurations
+ * `get_dataset_split_names/2` - Get available splits (train/test/validation)
+ * `get_dataset_config_names/2` - Get available configurations
+ * `upload_dataset/3` - Upload a dataset to Hugging Face Hub
+ * `cache_dir/0` - Get the cache directory path
+
+ ## Examples
+
+ # Load a dataset from Hugging Face
+ iex> {:ok, datasets} = ElixirDatasets.load_dataset({:hf, "imdb"})
+
+ # Load with specific split
+ iex> {:ok, train_data} = ElixirDatasets.load_dataset({:hf, "imdb"}, split: "train")
+
+ # Stream large datasets
+ iex> {:ok, stream} = ElixirDatasets.load_dataset({:hf, "c4"}, streaming: true)
+ iex> stream |> Enum.take(100)
+
+ # Get dataset information
+ iex> {:ok, info} = ElixirDatasets.get_dataset_info("imdb")
+
"""
@compile if Mix.env() == :test, do: :export_all
- alias ElixirDatasets.HuggingFace
- alias ElixirDatasets.DatasetInfo
- @valid_extensions_list ["jsonl", "csv", "parquet"]
+
+ alias ElixirDatasets.{Info, Loader, Repository}
@typedoc """
A location to fetch dataset files from.
@@ -22,64 +51,15 @@ defmodule ElixirDatasets do
* `{:local, path}` - a local directory or file path containing the datasets
"""
- @type t_repository :: {:hf, String.t()} | {:hf, String.t(), keyword()} | {:local, Path.t()}
-
- defp do_load_spec(repository, repo_files, num_proc) do
- files_to_download =
- Enum.filter(repo_files, fn {file_name, _etag} ->
- extension = file_name |> Path.extname() |> String.trim_leading(".")
- extension in @valid_extensions_list
- end)
-
- if num_proc > 1 do
- files_to_download
- |> Task.async_stream(
- fn {file_name, etag} ->
- extension = file_name |> Path.extname() |> String.trim_leading(".")
-
- case download(repository, file_name, etag) do
- {:ok, path} -> {:ok, {path, extension}}
- {:error, reason} -> {:error, "failed to download #{file_name}: #{reason}"}
- end
- end,
- max_concurrency: num_proc,
- ordered: true
- )
- |> Enum.reduce_while({:ok, []}, fn
- {:ok, {:ok, path_ext}}, {:ok, acc} ->
- {:cont, {:ok, [path_ext | acc]}}
-
- {:ok, {:error, reason}}, _acc ->
- {:halt, {:error, reason}}
-
- {:exit, reason}, _acc ->
- {:halt, {:error, "task failed: #{inspect(reason)}"}}
- end)
- |> case do
- {:ok, paths} -> {:ok, Enum.reverse(paths)}
- error -> error
- end
- else
- Enum.reduce_while(files_to_download, [], fn {file_name, etag}, acc ->
- extension = file_name |> Path.extname() |> String.trim_leading(".")
-
- case download(repository, file_name, etag) do
- {:ok, path} ->
- {:cont, [{path, extension} | acc]}
-
- {:error, reason} ->
- {:halt,
- {:error, "failed to download #{file_name} from #{inspect(repository)}: #{reason}"}}
- end
- end)
- |> case do
- {:error, _} = error -> error
- paths -> {:ok, Enum.reverse(paths)}
- end
- end
+ @type t_repository :: Repository.t_repository()
+
+ # Delegated to Loader module for backward compatibility with tests
+ def do_load_spec(repository, repo_files, num_proc) do
+ Loader.load_spec(repository, repo_files, num_proc)
end
- defp decode_config(path) do
+ # Kept public (was private) for backward compatibility with tests
+ def decode_config(path) do
path
|> File.read!()
|> Jason.decode()
@@ -96,6 +76,8 @@ defmodule ElixirDatasets do
@doc """
Fetches dataset information from the Hugging Face API.
+ Delegates to `ElixirDatasets.Info.get_dataset_info/2`.
+
## Parameters
* `repository_id` - the Hugging Face dataset repository ID (e.g., "aaaaa32r/elixirDatasets")
@@ -108,24 +90,13 @@ defmodule ElixirDatasets do
or `{:error, reason}` if the request fails.
"""
@spec get_dataset_info(String.t(), keyword()) :: {:ok, map()} | {:error, String.t()}
- def get_dataset_info(repository_id, opts \\ []) when is_binary(repository_id) do
- url = HuggingFace.Hub.dataset_info_url(repository_id)
-
- headers =
- case HuggingFace.Hub.get_auth_token(opts) do
- {:ok, auth_token} -> [{"Authorization", "Bearer #{auth_token}"}]
- {:error, _} -> []
- end
-
- with {:ok, response} <- ElixirDatasets.Utils.HTTP.request(:get, url, headers: headers),
- {:ok, data} <- Jason.decode(response.body) do
- {:ok, data}
- end
- end
+ defdelegate get_dataset_info(repository_id, opts \\ []), to: Info
@doc """
Fetches dataset information from the Hugging Face API and returns a list of DatasetInfo structs.
+ Delegates to `ElixirDatasets.Info.get_dataset_infos/2`.
+
This function retrieves all available dataset configurations for a given repository.
## Parameters
@@ -146,21 +117,14 @@ defmodule ElixirDatasets do
["csv", "default"]
"""
@spec get_dataset_infos(String.t(), keyword()) ::
- {:ok, [DatasetInfo.t()]} | {:error, String.t()}
- def get_dataset_infos(repository_id, opts \\ []) when is_binary(repository_id) do
- case get_dataset_info(repository_id, opts) do
- {:ok, info} ->
- dataset_infos = parse_dataset_infos(info)
- {:ok, dataset_infos}
-
- {:error, reason} ->
- {:error, reason}
- end
- end
+ {:ok, [ElixirDatasets.DatasetInfo.t()]} | {:error, String.t()}
+ defdelegate get_dataset_infos(repository_id, opts \\ []), to: Info
@doc """
Parses raw dataset info map into a list of DatasetInfo structs.
+ Delegates to `ElixirDatasets.Info.parse_dataset_infos/1`.
+
Extracts the dataset_info array from the HuggingFace API response's cardData field
and converts each entry into a DatasetInfo struct.
@@ -172,20 +136,14 @@ defmodule ElixirDatasets do
A list of DatasetInfo structs.
"""
- @spec parse_dataset_infos(map()) :: [DatasetInfo.t()]
- def parse_dataset_infos(data) when is_map(data) do
- data
- |> Map.get("cardData", %{})
- |> Map.get("dataset_info", [])
- |> case do
- list when is_list(list) -> Enum.map(list, &DatasetInfo.from_map/1)
- single -> [DatasetInfo.from_map(single)]
- end
- end
+ @spec parse_dataset_infos(map()) :: [ElixirDatasets.DatasetInfo.t()]
+ defdelegate parse_dataset_infos(data), to: Info
@doc """
Gets the split names (e.g., 'train', 'test', 'validation') for a dataset.
+ Delegates to `ElixirDatasets.Info.get_dataset_split_names/2`.
+
## Parameters
* `repository_id` - the Hugging Face dataset repository ID (e.g., "cornell-movie-review-data/rotten_tomatoes")
@@ -205,29 +163,13 @@ defmodule ElixirDatasets do
"""
@spec get_dataset_split_names(String.t(), keyword()) ::
{:ok, [String.t()]} | {:error, String.t()}
- def get_dataset_split_names(repository_id, opts \\ []) when is_binary(repository_id) do
- case get_dataset_infos(repository_id, opts) do
- {:ok, infos} ->
- split_names =
- infos
- |> Enum.flat_map(fn info ->
- case info.splits do
- nil -> []
- splits -> Enum.map(splits, fn split -> split["name"] end)
- end
- end)
- |> Enum.uniq()
-
- {:ok, split_names}
-
- {:error, reason} ->
- {:error, reason}
- end
- end
+ defdelegate get_dataset_split_names(repository_id, opts \\ []), to: Info
@doc """
Gets the configuration names available for a dataset.
+ Delegates to `ElixirDatasets.Info.get_dataset_config_names/2`.
+
## Parameters
* `repository_id` - the Hugging Face dataset repository ID (e.g., "glue")
@@ -247,20 +189,13 @@ defmodule ElixirDatasets do
"""
@spec get_dataset_config_names(String.t(), keyword()) ::
{:ok, [String.t()]} | {:error, String.t()}
- def get_dataset_config_names(repository_id, opts \\ []) when is_binary(repository_id) do
- case get_dataset_infos(repository_id, opts) do
- {:ok, infos} ->
- config_names = Enum.map(infos, fn info -> info.config_name end)
- {:ok, config_names}
-
- {:error, reason} ->
- {:error, reason}
- end
- end
+ defdelegate get_dataset_config_names(repository_id, opts \\ []), to: Info
@doc """
Loads a dataset from the given repository.
+ Delegates to `ElixirDatasets.Loader.load_dataset/2`.
+
The repository can be either a local directory or a Hugging Face repository.
## Options
@@ -315,51 +250,29 @@ defmodule ElixirDatasets do
## Examples
- # Load only the training split
- ElixirDatasets.load_dataset({:hf, "dataset_name"}, split: "train")
-
- # Load a specific configuration
- ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2")
+ iex> ElixirDatasets.load_dataset({:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train")
- # Load a specific split of a specific configuration
- ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2", split: "train")
+ iex> ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2")
- # Stream data progressively without downloading
- {:ok, stream} = ElixirDatasets.load_dataset(
- {:hf, "large_dataset"},
- split: "train",
- streaming: true
- )
+ iex> ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2", split: "train")
+ iex> {:ok, stream} = ElixirDatasets.load_dataset(
+ ...> {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ ...> split: "train",
+ ...> streaming: true
+ ...> )
- # Process first 100 rows without downloading entire dataset
- stream |> Stream.take(100) |> Enum.each(&process_row/1)
+ iex> stream |> Enum.take(3) |> IO.inspect()
"""
@spec load_dataset(t_repository(), keyword()) ::
{:ok, [Explorer.DataFrame.t()] | Enumerable.t()} | {:error, Exception.t()}
- def load_dataset(repository, opts \\ []) do
- repository = normalize_repository!(repository)
- split = opts[:split]
- name = opts[:name]
- streaming = opts[:streaming] || false
- num_proc = opts[:num_proc] || 1
-
- with {:ok, repo_files} <- get_repo_files(repository),
- {:ok, filtered_files} <- filter_files_by_config_and_split(repo_files, name, split) do
- if streaming do
- {:ok, build_streaming_dataset(repository, filtered_files, opts)}
- else
- with {:ok, paths_with_extensions} <-
- maybe_load_model_spec(opts, repository, filtered_files) do
- ElixirDatasets.Utils.Loader.load_datasets_from_paths(paths_with_extensions, num_proc)
- end
- end
- end
- end
+ defdelegate load_dataset(repository, opts \\ []), to: Loader
@doc """
Similar to `load_dataset/2` but raises an error if loading fails.
+ Delegates to `ElixirDatasets.Loader.load_dataset!/2`.
+
Accepts the same options as `load_dataset/2`:
* `:split` - which split to load (e.g., "train", "test", "validation")
* `:name` - dataset configuration name
@@ -372,195 +285,33 @@ defmodule ElixirDatasets do
## Examples
- # Load only training data
- datasets = ElixirDatasets.load_dataset!({:hf, "dataset_name"}, split: "train")
+ iex> datasets = ElixirDatasets.load_dataset!({:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train")
- # Stream data progressively
- stream = ElixirDatasets.load_dataset!({:hf, "dataset"}, streaming: true)
- stream |> Enum.take(10)
+ iex> stream = ElixirDatasets.load_dataset!({:hf, "cornell-movie-review-data/rotten_tomatoes"}, streaming: true)
+ iex> stream |> Enum.take(10)
"""
@spec load_dataset!(t_repository(), keyword()) ::
[Explorer.DataFrame.t()] | Enumerable.t()
- def load_dataset!(repository, opts \\ []) do
- case load_dataset(repository, opts) do
- {:ok, datasets} -> datasets
- {:error, reason} -> raise reason
- end
- end
-
- @spec upload_dataset(Explorer.DataFrame.t(), String.t(), keyword()) ::
- {:error, String.t()} | {:ok, binary()}
- def upload_dataset(df, repository, file_extension) do
- ElixirDatasets.Utils.Uploader.upload_dataset(df, repository, file_extension)
- end
-
- defp filter_files_by_config_and_split(repo_files, name, split) do
- filtered =
- repo_files
- |> filter_by_config_name(name)
- |> filter_by_split(split)
-
- {:ok, filtered}
- end
-
- defp filter_by_config_name(repo_files, nil), do: repo_files
-
- defp filter_by_config_name(repo_files, config_name) do
- filtered =
- Enum.filter(repo_files, fn {file_name, _etag} ->
- String.contains?(file_name, config_name)
- end)
+ defdelegate load_dataset!(repository, opts \\ []), to: Loader
- if is_map(repo_files) do
- Map.new(filtered)
- else
- filtered
- end
- end
-
- defp filter_by_split(repo_files, nil), do: repo_files
-
- defp filter_by_split(repo_files, split) when is_binary(split) do
- filtered =
- Enum.filter(repo_files, fn {file_name, _etag} ->
- base_name = Path.basename(file_name, Path.extname(file_name))
- String.contains?(base_name, split)
- end)
-
- if is_map(repo_files) do
- Map.new(filtered)
- else
- filtered
- end
- end
-
- defp maybe_load_model_spec(opts, repository, repo_files) do
- num_proc = opts[:num_proc] || 1
-
- with {:ok, spec} <- do_load_spec(repository, repo_files, num_proc) do
- {:ok, spec}
- end
- end
-
- defp get_repo_files({:local, dir}) do
- case File.ls(dir) do
- {:ok, filenames} ->
- repo_files =
- for filename <- filenames,
- path = Path.join(dir, filename),
- File.regular?(path),
- into: %{},
- do: {filename, nil}
-
- {:ok, repo_files}
-
- {:error, reason} ->
- {:error, "could not read #{dir}, reason: #{:file.format_error(reason)}"}
- end
- end
-
- defp get_repo_files({:hf, repository_id, opts}) do
- subdir = opts[:subdir]
- url = HuggingFace.Hub.file_listing_url(repository_id, subdir, opts[:revision])
- cache_scope = repository_id_to_cache_scope(repository_id)
-
- passthrough_opts = [
- :cache_dir,
- :offline,
- :auth_token,
- :etag,
- :download_mode,
- :verification_mode
- ]
-
- result =
- HuggingFace.Hub.cached_download(
- url,
- [cache_scope: cache_scope] ++ Keyword.take(opts, passthrough_opts)
- )
-
- with {:ok, path} <- result,
- {:ok, data} <- decode_config(path) do
- repo_files =
- for entry <- data, entry["type"] == "file", into: %{} do
- path = entry["path"]
-
- name =
- if subdir do
- String.replace_leading(path, subdir <> "/", "")
- else
- path
- end
-
- etag_content = entry["lfs"]["oid"] || entry["oid"]
- etag = <", etag_content::binary, ?">>
- {name, etag}
- end
-
- {:ok, repo_files}
- end
- end
-
- defp download({:local, dir}, filename, _etag) do
- path = Path.join(dir, filename)
-
- if File.exists?(path) do
- {:ok, path}
- else
- {:error, "local file #{inspect(path)} does not exist"}
- end
- end
-
- defp download({:hf, repository_id, opts}, filename, etag) do
- filename =
- if subdir = opts[:subdir] do
- subdir <> "/" <> filename
- else
- filename
- end
-
- url = HuggingFace.Hub.file_url(repository_id, filename, opts[:revision])
- cache_scope = repository_id_to_cache_scope(repository_id)
-
- passthrough_opts = [
- :cache_dir,
- :offline,
- :auth_token,
- :download_mode,
- :verification_mode
- ]
-
- HuggingFace.Hub.cached_download(
- url,
- [etag: etag, cache_scope: cache_scope] ++
- Keyword.take(opts, passthrough_opts)
- )
- end
-
- defp repository_id_to_cache_scope(repository_id) do
- repository_id
- |> String.replace("/", "--")
- |> String.replace(~r/[^\w-]/, "")
- end
+ @doc """
+ Uploads a dataset to Hugging Face Hub.
- defp normalize_repository!({:hf, repository_id}) when is_binary(repository_id) do
- {:hf, repository_id, []}
- end
+ ## Parameters
- defp normalize_repository!({:hf, repository_id, opts}) when is_binary(repository_id) do
- opts = Keyword.validate!(opts, [:revision, :cache_dir, :offline, :auth_token, :subdir])
- {:hf, repository_id, opts}
- end
+ * `df` - Explorer.DataFrame to upload
+ * `repository` - repository ID (e.g., "username/dataset-name")
+ * `file_extension` - keyword list of upload options, e.g. `[file_extension: "csv"]`
- defp normalize_repository!({:local, dir}) when is_binary(dir) do
- {:local, dir}
- end
+ ## Returns
- defp normalize_repository!(other) do
- raise ArgumentError,
- "expected repository to be either {:hf, repository_id}, {:hf, repository_id, options}" <>
- " or {:local, directory}, got: #{inspect(other)}"
+ `{:ok, response}` on success, or `{:error, reason}` on failure.
+ """
+ @spec upload_dataset(Explorer.DataFrame.t(), String.t(), keyword()) ::
+ {:error, String.t()} | {:ok, binary()}
+ def upload_dataset(df, repository, file_extension) do
+ ElixirDatasets.Utils.Uploader.upload_dataset(df, repository, file_extension)
end
@doc """
@@ -568,6 +319,15 @@ defmodule ElixirDatasets do
Defaults to the standard cache location for the given operating system.
Can be configured with the `ELIXIR_DATASETS_CACHE_DIR` environment variable.
+
+ ## Examples
+
+ iex> is_binary(ElixirDatasets.cache_dir())
+ true
+
+ iex> String.ends_with?(ElixirDatasets.cache_dir(), "elixir_datasets")
+ true
+
"""
@spec cache_dir() :: String.t()
def cache_dir() do
@@ -577,163 +337,4 @@ defmodule ElixirDatasets do
:filename.basedir(:user_cache, "elixir_datasets")
end
end
-
- defp build_streaming_dataset(repository, filtered_files, opts) do
- batch_size = opts[:batch_size] || 1000
-
- urls = build_streaming_urls(repository, filtered_files, opts)
-
- Stream.resource(
- fn -> init_streaming_state(urls, batch_size) end,
- &fetch_next_streaming_batch/1,
- &cleanup_streaming/1
- )
- end
-
- defp build_streaming_urls({:hf, repository_id, repo_opts}, filtered_files, load_opts) do
- auth_token = load_opts[:auth_token]
-
- Enum.map(filtered_files, fn {file_name, _etag} ->
- filename =
- if subdir = repo_opts[:subdir] do
- subdir <> "/" <> file_name
- else
- file_name
- end
-
- extension = file_name |> Path.extname() |> String.trim_leading(".")
- url = HuggingFace.Hub.file_url(repository_id, filename, repo_opts[:revision])
-
- {url, extension, auth_token}
- end)
- end
-
- defp build_streaming_urls({:local, dir}, filtered_files, _opts) do
- Enum.map(filtered_files, fn {file_name, _etag} ->
- path = Path.join(dir, file_name)
- extension = file_name |> Path.extname() |> String.trim_leading(".")
- {path, extension, nil}
- end)
- end
-
- defp init_streaming_state(urls, batch_size) do
- %{
- urls: urls,
- current_url_index: 0,
- current_lazy_df: nil,
- current_offset: 0,
- batch_size: batch_size,
- total_urls: length(urls)
- }
- end
-
- defp fetch_next_streaming_batch(%{current_url_index: idx, total_urls: total} = state)
- when idx >= total do
- {:halt, state}
- end
-
- defp fetch_next_streaming_batch(state) do
- case ensure_lazy_df_loaded(state) do
- {:ok, state_with_df} ->
- fetch_batch_from_lazy_df(state_with_df)
-
- {:error, _reason} ->
- new_state = %{state | current_url_index: state.current_url_index + 1, current_offset: 0}
- fetch_next_streaming_batch(new_state)
- end
- end
-
- defp ensure_lazy_df_loaded(%{current_lazy_df: nil} = state) do
- {url, extension, auth_token} = Enum.at(state.urls, state.current_url_index)
-
- case load_lazy_dataframe_from_url(url, extension, auth_token) do
- {:ok, lazy_df} ->
- {:ok, %{state | current_lazy_df: lazy_df}}
-
- {:error, reason} ->
- {:error, reason}
- end
- end
-
- defp ensure_lazy_df_loaded(state), do: {:ok, state}
-
- defp load_lazy_dataframe_from_url(url_or_path, extension, _auth_token) do
- is_url =
- String.starts_with?(url_or_path, "http://") or String.starts_with?(url_or_path, "https://")
-
- case {extension, is_url} do
- {"parquet", true} ->
- Explorer.DataFrame.from_parquet(url_or_path, lazy: true)
-
- {"parquet", false} ->
- Explorer.DataFrame.from_parquet(url_or_path, lazy: true)
-
- {"csv", false} ->
- Explorer.DataFrame.from_csv(url_or_path, lazy: true)
-
- {"jsonl", false} ->
- Explorer.DataFrame.from_ndjson(url_or_path, lazy: true)
-
- {"csv", true} ->
- case Explorer.DataFrame.from_csv(url_or_path) do
- {:ok, df} -> {:ok, df}
- error -> error
- end
-
- {"jsonl", true} ->
- case Explorer.DataFrame.from_ndjson(url_or_path) do
- {:ok, df} -> {:ok, df}
- error -> error
- end
-
- _ ->
- {:error, "Unsupported format for streaming: #{extension}"}
- end
- end
-
- defp fetch_batch_from_lazy_df(state) do
- %{current_lazy_df: df, current_offset: offset, batch_size: batch_size} = state
-
- batch_df =
- df
- |> Explorer.DataFrame.slice(offset, batch_size)
- |> then(fn sliced ->
- if Explorer.DataFrame.lazy?(sliced) do
- Explorer.DataFrame.collect(sliced)
- else
- sliced
- end
- end)
-
- batch_rows = Explorer.DataFrame.to_rows(batch_df)
- num_rows = length(batch_rows)
-
- cond do
- num_rows == 0 ->
- new_state = %{
- state
- | current_url_index: state.current_url_index + 1,
- current_lazy_df: nil,
- current_offset: 0
- }
-
- fetch_next_streaming_batch(new_state)
-
- num_rows < batch_size ->
- new_state = %{
- state
- | current_url_index: state.current_url_index + 1,
- current_lazy_df: nil,
- current_offset: 0
- }
-
- {batch_rows, new_state}
-
- true ->
- new_state = %{state | current_offset: offset + batch_size}
- {batch_rows, new_state}
- end
- end
-
- defp cleanup_streaming(_state), do: :ok
end
diff --git a/lib/elixir_datasets/dataset_loader.ex b/lib/elixir_datasets/dataset_loader.ex
new file mode 100644
index 0000000..934c33a
--- /dev/null
+++ b/lib/elixir_datasets/dataset_loader.ex
@@ -0,0 +1,228 @@
+defmodule ElixirDatasets.Loader do
+ @moduledoc """
+ Functions for loading datasets from repositories.
+ """
+
+ alias ElixirDatasets.Repository
+ alias ElixirDatasets.Filter
+ alias ElixirDatasets.Streaming
+
+ @valid_extensions_list ["jsonl", "csv", "parquet"]
+
+ @doc """
+ Loads a dataset from the given repository.
+
+ The repository can be either a local directory or a Hugging Face repository.
+
+ ## Options
+
+ ### Data Loading Options
+
+ * `:split` - which split of the data to load (e.g., "train", "test", "validation").
+ If not specified, all splits are loaded. Files are matched by name patterns
+ (e.g., "train.csv", "test-00000.parquet", "validation.jsonl").
+
+ * `:name` - the name of the dataset configuration to load. For datasets with
+ multiple configurations, this specifies which one to use. Files are matched
+ by looking for the config name in the file path (e.g., "sst2/train.parquet").
+
+ * `:streaming` - if `true`, returns an enumerable that progressively yields
+ data rows (maps) without loading the entire dataset into memory. Data is
+ fetched on-demand as you iterate. Useful for large datasets. Default is `false`.
+
+ ### HuggingFace Hub Options
+
+ * `:auth_token` - the token to use as HTTP bearer authorization
+ for remote files. If not provided, the token from the
+ `ELIXIR_DATASETS_HF_TOKEN` environment variable is used.
+
+ * `:cache_dir` - the directory to store downloaded files in.
+ Defaults to the standard cache location for the operating system.
+
+ * `:offline` - if `true`, only cached files are used and no network
+ requests are made. Returns an error if the file is not cached.
+
+ * `:etag` - if provided, skips the HEAD request to fetch the latest
+ ETag value and uses this value instead.
+
+ * `:download_mode` - controls download/cache behavior. Can be:
+ - `:reuse_dataset_if_exists` (default) - reuse cached data if available
+ - `:force_redownload` - always download, even if cached
+
+ * `:verification_mode` - controls verification checks. Can be:
+ - `:basic_checks` (default) - basic validation
+ - `:no_checks` - skip all validation
+
+ * `:num_proc` - number of processes to use for parallel dataset processing.
+ Default is `1` (no parallelization). Set to a higher number to speed up
+ dataset downloading and loading. For example, `num_proc: 4` will use 4
+ parallel processes.
+
+ ## Returns
+
+ - When `streaming: false` (default): `{:ok, datasets}` where `datasets` is a list of Explorer.DataFrame.t()
+ - When `streaming: true`: `{:ok, stream}` where `stream` is an Enumerable that yields rows progressively
+ - On error: `{:error, reason}`
+
+ ## Examples
+
+ iex> ElixirDatasets.Loader.load_dataset({:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train")
+
+ iex> ElixirDatasets.Loader.load_dataset({:hf, "glue"}, name: "sst2")
+
+ iex> {:ok, stream} = ElixirDatasets.Loader.load_dataset(
+ ...> {:hf, "cornell-movie-review-data/rotten_tomatoes"},
+ ...> split: "train",
+ ...> streaming: true
+ ...> )
+ iex> stream |> Enum.take(100) |> IO.inspect()
+
+ """
+ @spec load_dataset(Repository.t_repository(), keyword()) ::
+ {:ok, [Explorer.DataFrame.t()] | Enumerable.t()} | {:error, Exception.t()}
+ def load_dataset(repository, opts \\ []) do
+ repository = Repository.normalize!(repository)
+ split = opts[:split]
+ name = opts[:name]
+ streaming = opts[:streaming] || false
+ num_proc = opts[:num_proc] || 1
+
+ repository = merge_download_opts(repository, opts)
+
+ with {:ok, repo_files} <- Repository.get_files(repository),
+ {:ok, filtered_files} <- Filter.by_config_and_split(repo_files, name, split) do
+ if streaming do
+ {:ok, Streaming.build(repository, filtered_files, opts)}
+ else
+ with {:ok, paths_with_extensions} <- load_spec(repository, filtered_files, num_proc) do
+ ElixirDatasets.Utils.Loader.load_datasets_from_paths(paths_with_extensions, num_proc)
+ end
+ end
+ end
+ end
+
+ @doc """
+ Similar to `load_dataset/2` but raises an error if loading fails.
+
+ Accepts the same options as `load_dataset/2`.
+
+ ## Returns
+
+ * a list of loaded datasets (or a Stream if streaming is enabled)
+ * raises an error if loading fails
+
+ ## Examples
+
+ iex> datasets = ElixirDatasets.Loader.load_dataset!({:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train")
+
+ iex> stream = ElixirDatasets.Loader.load_dataset!({:hf, "cornell-movie-review-data/rotten_tomatoes"}, streaming: true)
+ iex> stream |> Enum.take(10)
+
+ """
+ @spec load_dataset!(Repository.t_repository(), keyword()) ::
+ [Explorer.DataFrame.t()] | Enumerable.t()
+ def load_dataset!(repository, opts \\ []) do
+ case load_dataset(repository, opts) do
+ {:ok, datasets} -> datasets
+ {:error, reason} -> raise reason
+ end
+ end
+
+ @doc """
+ Downloads the supported dataset files of a repository and returns their local paths.
+
+ Keeps only files with a valid extension (`jsonl`, `csv`, `parquet`); downloads run concurrently when `num_proc > 1` and one at a time otherwise.
+
+ ## Parameters
+
+ * `repository` - normalized repository tuple
+ * `repo_files` - map of files from repository
+ * `num_proc` - number of parallel processes to use
+
+ ## Returns
+
+ `{:ok, paths_with_extensions}` where each element is `{path, extension}`,
+ or `{:error, reason}` if download fails.
+ """
+ @spec load_spec(tuple(), map(), pos_integer()) ::
+ {:ok, list({String.t(), String.t()})} | {:error, String.t()}
+ def load_spec(repository, repo_files, num_proc) do
+ files_to_download =
+ Enum.filter(repo_files, fn {file_name, _etag} ->
+ extension = file_name |> Path.extname() |> String.trim_leading(".")
+ extension in @valid_extensions_list
+ end)
+
+ if num_proc > 1 do
+ download_parallel(repository, files_to_download, num_proc)
+ else
+ download_sequential(repository, files_to_download)
+ end
+ end
+
+ defp download_parallel(repository, files_to_download, num_proc) do
+ files_to_download
+ |> Task.async_stream(
+ fn {file_name, etag} ->
+ extension = file_name |> Path.extname() |> String.trim_leading(".")
+
+ case Repository.download(repository, file_name, etag) do
+ {:ok, path} -> {:ok, {path, extension}}
+ {:error, reason} -> {:error, "failed to download #{file_name}: #{reason}"}
+ end
+ end,
+ max_concurrency: num_proc,
+ ordered: true,
+ timeout: :infinity
+ )
+ |> Enum.reduce_while({:ok, []}, fn
+ {:ok, {:ok, path_ext}}, {:ok, acc} ->
+ {:cont, {:ok, [path_ext | acc]}}
+
+ {:ok, {:error, reason}}, _acc ->
+ {:halt, {:error, reason}}
+
+ {:exit, reason}, _acc ->
+ {:halt, {:error, "task failed: #{inspect(reason)}"}}
+ end)
+ |> case do
+ {:ok, paths} -> {:ok, Enum.reverse(paths)}
+ error -> error
+ end
+ end
+
+ defp download_sequential(repository, files_to_download) do
+ Enum.reduce_while(files_to_download, [], fn {file_name, etag}, acc ->
+ extension = file_name |> Path.extname() |> String.trim_leading(".")
+
+ case Repository.download(repository, file_name, etag) do
+ {:ok, path} ->
+ {:cont, [{path, extension} | acc]}
+
+ {:error, reason} ->
+ {:halt,
+ {:error, "failed to download #{file_name} from #{inspect(repository)}: #{reason}"}}
+ end
+ end)
+ |> case do
+ {:error, _} = error -> error
+ paths -> {:ok, Enum.reverse(paths)}
+ end
+ end
+
+ defp merge_download_opts({:hf, repository_id, repo_opts}, load_opts) do
+ download_opts = [
+ :download_mode,
+ :verification_mode,
+ :cache_dir,
+ :auth_token,
+ :offline,
+ :revision
+ ]
+
+ merged_opts = Keyword.merge(repo_opts, Keyword.take(load_opts, download_opts))
+ {:hf, repository_id, merged_opts}
+ end
+
+ defp merge_download_opts(repository, _load_opts), do: repository
+end
diff --git a/lib/elixir_datasets/filter.ex b/lib/elixir_datasets/filter.ex
new file mode 100644
index 0000000..e34af09
--- /dev/null
+++ b/lib/elixir_datasets/filter.ex
@@ -0,0 +1,103 @@
+defmodule ElixirDatasets.Filter do
+ @moduledoc """
+ Functions for filtering dataset files by configuration and split.
+ """
+
+  @doc """
+  Narrows a repository file listing down to one configuration and/or split.
+
+  The configuration filter runs first, then the split filter. Passing `nil`
+  for `name` or `split` disables the corresponding filter.
+
+  ## Parameters
+
+    * `repo_files` - map of repository files (`%{filename => etag}`)
+    * `name` - configuration name to keep, or `nil`
+    * `split` - split name to keep (e.g., "train", "test"), or `nil`
+
+  ## Returns
+
+  `{:ok, filtered_files}`, where `filtered_files` holds the matching entries.
+
+  ## Examples
+
+      iex> files = %{"train.csv" => nil, "test.csv" => nil}
+      iex> ElixirDatasets.Filter.by_config_and_split(files, nil, "train")
+      {:ok, %{"train.csv" => nil}}
+  """
+  @spec by_config_and_split(map(), String.t() | nil, String.t() | nil) :: {:ok, map()}
+  def by_config_and_split(repo_files, name, split) do
+    config_matches = by_config_name(repo_files, name)
+
+    {:ok, by_split(config_matches, split)}
+  end
+
+  @doc """
+  Keeps only the files that belong to the given configuration.
+
+  A file matches when `config_name` occurs anywhere in its path. This is a
+  plain substring test: directory boundaries and file extensions get no
+  special treatment. When `config_name` is `nil`, the input is returned
+  untouched.
+
+  ## Parameters
+
+    * `repo_files` - map or list of `{file_name, etag}` entries
+    * `config_name` - configuration name to look for, or `nil`
+
+  ## Returns
+
+  The matching entries: a map when `repo_files` is a map, a list otherwise.
+  """
+  @spec by_config_name(map() | list(), String.t() | nil) :: map() | list()
+  def by_config_name(repo_files, nil), do: repo_files
+
+  def by_config_name(repo_files, config_name) do
+    in_config? = fn {file_name, _etag} -> String.contains?(file_name, config_name) end
+
+    # `Enum.filter/2` always produces a list, so map input is rebuilt into a
+    # map to hand back the same kind of collection that came in.
+    case Enum.filter(repo_files, in_config?) do
+      kept when is_map(repo_files) -> Map.new(kept)
+      kept -> kept
+    end
+  end
+
+  @doc """
+  Keeps only the files that belong to the given split.
+
+  A file matches when its basename, with the extension stripped, contains
+  `split`. When `split` is `nil`, the input is returned untouched.
+
+  ## Parameters
+
+    * `repo_files` - map or list of `{file_name, etag}` entries
+    * `split` - split name to look for (e.g., "train", "test", "validation"), or `nil`
+
+  ## Returns
+
+  The matching entries: a map when `repo_files` is a map, a list otherwise.
+
+  ## Examples
+
+      iex> files = %{"train.csv" => nil, "test.csv" => nil, "validation.csv" => nil}
+      iex> ElixirDatasets.Filter.by_split(files, "train")
+      %{"train.csv" => nil}
+  """
+  @spec by_split(map() | list(), String.t() | nil) :: map() | list()
+  def by_split(repo_files, nil), do: repo_files
+
+  def by_split(repo_files, split) when is_binary(split) do
+    in_split? = fn {file_name, _etag} ->
+      file_name
+      |> Path.basename(Path.extname(file_name))
+      |> String.contains?(split)
+    end
+
+    # `Enum.filter/2` always produces a list; rebuild a map for map input.
+    case Enum.filter(repo_files, in_split?) do
+      kept when is_map(repo_files) -> Map.new(kept)
+      kept -> kept
+    end
+  end
+end
diff --git a/lib/huggingface/hub.ex b/lib/elixir_datasets/huggingface/hub.ex
similarity index 99%
rename from lib/huggingface/hub.ex
rename to lib/elixir_datasets/huggingface/hub.ex
index ff84e75..ad119f8 100644
--- a/lib/huggingface/hub.ex
+++ b/lib/elixir_datasets/huggingface/hub.ex
@@ -76,7 +76,7 @@ defmodule ElixirDatasets.HuggingFace.Hub do
"""
@spec cached_download(String.t(), keyword()) :: {:ok, String.t()} | {:error, String.t()}
def cached_download(url, opts \\ []) do
- cache_dir = opts[:cache_dir] || ElixirDatasets.cache_dir()
+ cache_dir = opts[:cache_dir] || ElixirDatasets.cache_dir() |> Path.expand()
offline = Keyword.get(opts, :offline, elixir_datasets_offline?())
auth_token = opts[:auth_token]
download_mode = opts[:download_mode] || :reuse_dataset_if_exists
diff --git a/lib/elixir_datasets/info_getter.ex b/lib/elixir_datasets/info_getter.ex
new file mode 100644
index 0000000..57e0f9a
--- /dev/null
+++ b/lib/elixir_datasets/info_getter.ex
@@ -0,0 +1,173 @@
+defmodule ElixirDatasets.Info do
+ @moduledoc """
+ Functions for fetching and parsing dataset metadata from Hugging Face Hub.
+ """
+
+ alias ElixirDatasets.HuggingFace
+ alias ElixirDatasets.DatasetInfo
+
+  @doc """
+  Fetches the raw metadata of a dataset from the Hugging Face API.
+
+  `repository_id` is the dataset repository ID (e.g., "aaaaa32r/elixirDatasets");
+  `opts` may carry `:auth_token`, the token to use as HTTP bearer authorization.
+
+  Returns `{:ok, dataset_info}` with the decoded JSON body, or `{:error, reason}`
+  when the request fails or the response body is not valid JSON.
+  """
+  @spec get_dataset_info(String.t(), keyword()) :: {:ok, map()} | {:error, String.t()}
+  def get_dataset_info(repository_id, opts \\ []) when is_binary(repository_id) do
+    url = HuggingFace.Hub.dataset_info_url(repository_id)
+
+    headers =
+      case HuggingFace.Hub.get_auth_token(opts) do
+        {:ok, auth_token} -> [{"Authorization", "Bearer #{auth_token}"}]
+        {:error, _} -> []
+      end
+
+    with {:ok, response} <- ElixirDatasets.Utils.HTTP.request(:get, url, headers: headers),
+         {:ok, data} <- Jason.decode(response.body) do
+      {:ok, data}
+    else
+      # Keep the error a string, as the @spec promises, instead of a raw exception struct.
+      {:error, %Jason.DecodeError{} = error} ->
+        {:error, "failed to decode dataset info: #{Exception.message(error)}"}
+      other -> other
+    end
+  end
+
+  @doc """
+  Lists every configuration of a dataset as DatasetInfo structs.
+
+  The raw metadata is fetched with `get_dataset_info/2` and converted with
+  `parse_dataset_infos/1`; an error from the fetch is returned unchanged.
+
+  ## Parameters
+
+    * `repository_id` - the Hugging Face dataset repository ID (e.g., "aaaaa32r/elixirDatasets")
+    * `opts` - optional keyword list with the following options:
+      * `:auth_token` - the token to use as HTTP bearer authorization
+
+  ## Returns
+
+  `{:ok, dataset_infos}`, where `dataset_infos` is a list of DatasetInfo structs,
+  or `{:error, reason}` if the request fails.
+
+  ## Examples
+
+      iex> {:ok, infos} = ElixirDatasets.Info.get_dataset_infos("aaaaa32r/elixirDatasets")
+      iex> Enum.map(infos, & &1.config_name)
+      ["csv", "default"]
+  """
+  @spec get_dataset_infos(String.t(), keyword()) ::
+          {:ok, [DatasetInfo.t()]} | {:error, String.t()}
+  def get_dataset_infos(repository_id, opts \\ []) when is_binary(repository_id) do
+    case get_dataset_info(repository_id, opts) do
+      {:error, reason} ->
+        {:error, reason}
+
+      {:ok, raw_info} ->
+        {:ok, parse_dataset_infos(raw_info)}
+    end
+  end
+
+  @doc """
+  Parses a raw dataset info map into a list of DatasetInfo structs.
+
+  `data` is the raw response map from the Hugging Face API. The `dataset_info`
+  entry of its `cardData` field is converted with `DatasetInfo.from_map/1`: a
+  list yields one struct per element, any other non-nil value a single struct.
+
+  ## Returns
+
+  A list of DatasetInfo structs; `[]` when `cardData` or `dataset_info` is
+  missing or `nil`.
+  """
+  @spec parse_dataset_infos(map()) :: [DatasetInfo.t()]
+  def parse_dataset_infos(data) when is_map(data) do
+    # `|| %{}` also covers a "cardData" key that is present but null, which
+    # `Map.get/3` with a default would hand back as nil.
+    card_data = Map.get(data, "cardData") || %{}
+
+    case Map.get(card_data, "dataset_info") do
+      nil -> []
+      entries when is_list(entries) -> Enum.map(entries, &DatasetInfo.from_map/1)
+      entry -> [DatasetInfo.from_map(entry)]
+    end
+  end
+
+  @doc """
+  Gets the split names (e.g., 'train', 'test', 'validation') for a dataset.
+
+  The names are gathered across all configurations returned by
+  `get_dataset_infos/2`; configurations without splits are ignored and
+  duplicate names are reported once, in first-seen order.
+
+  ## Parameters
+
+    * `repository_id` - the Hugging Face dataset repository ID (e.g., "cornell-movie-review-data/rotten_tomatoes")
+    * `opts` - optional keyword list with the following options:
+      * `:auth_token` - the token to use as HTTP bearer authorization
+
+  ## Returns
+
+  `{:ok, split_names}`, where `split_names` is a list of strings naming the
+  available splits, or `{:error, reason}` if the request fails.
+
+  ## Examples
+
+      iex> {:ok, splits} = ElixirDatasets.Info.get_dataset_split_names("cornell-movie-review-data/rotten_tomatoes")
+      iex> splits
+      ["train", "validation", "test"]
+  """
+  @spec get_dataset_split_names(String.t(), keyword()) ::
+          {:ok, [String.t()]} | {:error, String.t()}
+  def get_dataset_split_names(repository_id, opts \\ []) when is_binary(repository_id) do
+    case get_dataset_infos(repository_id, opts) do
+      {:error, reason} ->
+        {:error, reason}
+
+      {:ok, infos} ->
+        # `uniq: true` keeps the first occurrence of each name.
+        split_names =
+          for info <- infos, not is_nil(info.splits), split <- info.splits, uniq: true do
+            split["name"]
+          end
+
+        {:ok, split_names}
+    end
+  end
+
+  @doc """
+  Gets the configuration names available for a dataset.
+
+  The names are read from the `config_name` field of every entry returned by
+  `get_dataset_infos/2`, in the order those entries are returned. An error
+  from that call is passed through unchanged.
+
+  ## Parameters
+
+    * `repository_id` - the Hugging Face dataset repository ID (e.g., "glue")
+    * `opts` - optional keyword list with the following options:
+      * `:auth_token` - the token to use as HTTP bearer authorization
+
+  ## Returns
+
+  `{:ok, config_names}`, where `config_names` is a list of configuration names,
+  or `{:error, reason}` if the request fails.
+
+  ## Examples
+
+      iex> {:ok, configs} = ElixirDatasets.Info.get_dataset_config_names("glue")
+      iex> Enum.member?(configs, "cola")
+      true
+  """
+  @spec get_dataset_config_names(String.t(), keyword()) ::
+          {:ok, [String.t()]} | {:error, String.t()}
+  def get_dataset_config_names(repository_id, opts \\ []) when is_binary(repository_id) do
+    case get_dataset_infos(repository_id, opts) do
+      {:error, reason} -> {:error, reason}
+      {:ok, infos} -> {:ok, Enum.map(infos, & &1.config_name)}
+    end
+  end
+end
diff --git a/lib/elixir_datasets/repository.ex b/lib/elixir_datasets/repository.ex
new file mode 100644
index 0000000..4fbdc50
--- /dev/null
+++ b/lib/elixir_datasets/repository.ex
@@ -0,0 +1,202 @@
+defmodule ElixirDatasets.Repository do
+ @moduledoc """
+ Functions for managing dataset repositories (local and Hugging Face).
+ """
+
+ alias ElixirDatasets.HuggingFace
+
+ @typedoc """
+ A location to fetch dataset files from.
+ Can be either a Hugging Face repository or a local resource:
+
+ * `{:hf, repository_id}` - the Hugging Face repository ID
+
+ * `{:hf, repository_id, options}` - the Hugging Face repository ID
+ with additional options
+
+ * `{:local, path}` - a local directory or file path containing the datasets
+ """
+ @type t_repository :: {:hf, String.t()} | {:hf, String.t(), keyword()} | {:local, Path.t()}
+
+  @doc """
+  Normalizes a repository tuple; raises `ArgumentError` on any other shape.
+
+  ## Examples
+
+      iex> ElixirDatasets.Repository.normalize!({:hf, "repo/name"})
+      {:hf, "repo/name", []}
+
+      iex> ElixirDatasets.Repository.normalize!({:local, "/path/to/data"})
+      {:local, "/path/to/data"}
+  """
+  @spec normalize!(t_repository()) :: t_repository()
+  def normalize!(repository) do
+    case repository do
+      {:hf, repository_id} when is_binary(repository_id) ->
+        {:hf, repository_id, []}
+
+      {:hf, repository_id, opts} when is_binary(repository_id) ->
+        allowed = [:revision, :cache_dir, :offline, :auth_token, :subdir]
+        {:hf, repository_id, Keyword.validate!(opts, allowed)}
+
+      {:local, dir} when is_binary(dir) ->
+        {:local, dir}
+
+      other ->
+        raise ArgumentError,
+              "expected repository to be either {:hf, repository_id}, " <>
+                "{:hf, repository_id, options} or {:local, directory}, got: #{inspect(other)}"
+    end
+  end
+
+  @doc """
+  Lists the files available in a repository.
+
+  For a local repository, the regular files directly inside the directory are
+  listed, each with a `nil` etag. For a Hugging Face repository, the file
+  listing is obtained through `HuggingFace.Hub.cached_download/2`.
+
+  ## Returns
+
+  `{:ok, repo_files}`, where `repo_files` is a map of `%{filename => etag}`,
+  or `{:error, reason}` if the listing cannot be obtained.
+  """
+  @spec get_files(t_repository()) :: {:ok, map()} | {:error, String.t()}
+  def get_files({:local, dir}) do
+    case File.ls(dir) do
+      {:error, reason} ->
+        {:error, "could not read #{dir}, reason: #{:file.format_error(reason)}"}
+
+      {:ok, filenames} ->
+        # Only regular files are kept; directories and other entries are dropped.
+        repo_files =
+          filenames
+          |> Enum.filter(&File.regular?(Path.join(dir, &1)))
+          |> Map.new(&{&1, nil})
+
+        {:ok, repo_files}
+    end
+  end
+
+  # Hugging Face clause: obtains the repository file listing through the cached
+  # Hub download and converts it into a `%{filename => etag}` map.
+  def get_files({:hf, repository_id, opts}) do
+    subdir = opts[:subdir]
+    url = HuggingFace.Hub.file_listing_url(repository_id, subdir, opts[:revision])
+    cache_scope = repository_id_to_cache_scope(repository_id)
+
+    passthrough_opts =
+      [:cache_dir, :offline, :auth_token, :etag, :download_mode, :verification_mode]
+
+    result =
+      HuggingFace.Hub.cached_download(
+        url,
+        [cache_scope: cache_scope] ++ Keyword.take(opts, passthrough_opts)
+      )
+
+    with {:ok, path} <- result,
+         {:ok, data} <- decode_config(path) do
+      repo_files =
+        for entry <- data, entry["type"] == "file", into: %{} do
+          path = entry["path"]
+
+          # Drop a leading `subdir/` so the resulting names are relative to the
+          # requested subdirectory.
+          name =
+            if subdir do
+              String.replace_leading(path, subdir <> "/", "")
+            else
+              path
+            end
+
+          # Prefer the oid nested under "lfs" when present, else the entry's own "oid".
+          etag_content = entry["lfs"]["oid"] || entry["oid"]
+          # The etag is that id wrapped in literal double-quote bytes (`?"`).
+          etag = <<?", etag_content::binary, ?">>
+          {name, etag}
+        end
+
+      {:ok, repo_files}
+    end
+  end
+
+  @doc """
+  Downloads a file from a repository.
+
+  For a local repository nothing is transferred: the file is looked up under
+  the repository directory and its path is returned when it exists. For a
+  Hugging Face repository the file is fetched through the cached Hub
+  download, with `etag` passed along to it.
+
+  ## Returns
+
+  `{:ok, path}`, where `path` is the local file path,
+  or `{:error, reason}` if the file cannot be provided.
+  """
+  @spec download(t_repository(), String.t(), String.t() | nil) ::
+          {:ok, String.t()} | {:error, String.t()}
+  def download({:local, dir}, filename, _etag) do
+    candidate = Path.join(dir, filename)
+    not_found = {:error, "local file #{inspect(candidate)} does not exist"}
+
+    # No transfer happens for local repositories; only existence is checked.
+    if File.exists?(candidate), do: {:ok, candidate}, else: not_found
+  end
+
+  # Hugging Face clause: resolves the URL of the file (with the repository
+  # `:subdir` prepended to the file name when one is set) and delegates to the
+  # cached Hub download. The result of that call is returned as-is.
+  # `etag` is handed over unchanged.
+  def download({:hf, repository_id, opts}, filename, etag) do
+    subdir = opts[:subdir]
+    remote_name = if subdir, do: subdir <> "/" <> filename, else: filename
+
+    url = HuggingFace.Hub.file_url(repository_id, remote_name, opts[:revision])
+
+    # Only these repository options are handed on to the Hub client.
+    forwarded =
+      Keyword.take(opts, [
+        :cache_dir,
+        :offline,
+        :auth_token,
+        :download_mode,
+        :verification_mode
+      ])
+
+    # The etag and the per-repository cache scope always come first.
+    base_opts = [etag: etag, cache_scope: repository_id_to_cache_scope(repository_id)]
+
+    HuggingFace.Hub.cached_download(url, base_opts ++ forwarded)
+  end
+
+  @doc """
+  Converts a repository ID to a cache scope string.
+
+  Each slash becomes a double dash; afterwards every character that is neither
+  a word character nor a hyphen is removed.
+
+  ## Examples
+
+      iex> ElixirDatasets.Repository.repository_id_to_cache_scope("user/repo-name")
+      "user--repo-name"
+  """
+  @spec repository_id_to_cache_scope(String.t()) :: String.t()
+  def repository_id_to_cache_scope(repository_id) do
+    dashed = String.replace(repository_id, "/", "--")
+    Regex.replace(~r/[^\w-]/, dashed, "")
+  end
+
+  # Reads and JSON-decodes the file at `path`; failures become `{:error, message}`.
+  defp decode_config(path) do
+    with {:ok, content} <- File.read(path),
+         {:ok, data} <- Jason.decode(content) do
+      {:ok, data}
+    else
+      {:error, posix} when is_atom(posix) ->
+        {:error, "could not read #{path}, reason: #{:file.format_error(posix)}"}
+      {:error, reason} ->
+        {:error,
+         "failed to parse the config file, it is not a valid JSON. Reason: #{inspect(reason)}"}
+    end
+  end
+end
diff --git a/lib/elixir_datasets/streaming.ex b/lib/elixir_datasets/streaming.ex
new file mode 100644
index 0000000..e70f0a3
--- /dev/null
+++ b/lib/elixir_datasets/streaming.ex
@@ -0,0 +1,189 @@
+defmodule ElixirDatasets.Streaming do
+ @moduledoc """
+ Functions for streaming datasets progressively without loading everything into memory.
+ """
+
+ alias ElixirDatasets.HuggingFace
+
+  @doc """
+  Builds a stream that yields the rows of the given files batch by batch.
+
+  The sources are resolved up front with `build_urls/3`; data is only read
+  once the stream is consumed. A file that cannot be opened is skipped
+  without raising.
+
+  ## Parameters
+
+    * `repository` - normalized repository tuple
+    * `filtered_files` - map of files to stream from
+    * `opts` - options including:
+      * `:batch_size` - number of rows to fetch per batch (default: 1000)
+      * `:auth_token` - authentication token for Hugging Face
+
+  ## Returns
+
+  A Stream that yields rows as maps.
+  """
+  @spec build(tuple(), map(), keyword()) :: Enumerable.t()
+  def build(repository, filtered_files, opts) do
+    sources = build_urls(repository, filtered_files, opts)
+    rows_per_batch = opts[:batch_size] || 1000
+    start = fn -> init_state(sources, rows_per_batch) end
+
+    Stream.resource(start, &fetch_next_batch/1, &cleanup/1)
+  end
+
+  @doc """
+  Builds the list of sources to stream from.
+
+  Every entry is a `{location, extension, auth_token}` tuple, where
+  `extension` is the file extension without the leading dot.
+
+  For Hugging Face repositories, `location` is the URL of the file (with the
+  repository `:subdir` prepended to the file name when one is set) and
+  `auth_token` is taken from the load options.
+
+  For local repositories, `location` is the file path inside the directory
+  and `auth_token` is `nil`.
+  """
+  @spec build_urls(tuple(), map(), keyword()) :: list()
+  def build_urls({:hf, repository_id, repo_opts}, filtered_files, load_opts) do
+    token = load_opts[:auth_token]
+    subdir = repo_opts[:subdir]
+    revision = repo_opts[:revision]
+
+    Enum.map(filtered_files, fn {file_name, _etag} ->
+      remote_name = if subdir, do: subdir <> "/" <> file_name, else: file_name
+      url = HuggingFace.Hub.file_url(repository_id, remote_name, revision)
+
+      {url, String.trim_leading(Path.extname(file_name), "."), token}
+    end)
+  end
+
+  def build_urls({:local, dir}, filtered_files, _opts) do
+    Enum.map(filtered_files, fn {file_name, _etag} ->
+      {Path.join(dir, file_name), String.trim_leading(Path.extname(file_name), "."), nil}
+    end)
+  end
+
+  # Builds the initial `Stream.resource/3` accumulator: the fixed settings
+  # (sources, their count, batch size) merged with a cursor positioned on the
+  # first source, with no frame loaded yet and a zero row offset.
+  # `total_urls` is computed once here so each step can compare against it.
+  defp init_state(urls, batch_size) do
+    config = %{urls: urls, total_urls: length(urls), batch_size: batch_size}
+    cursor = %{current_url_index: 0, current_lazy_df: nil, current_offset: 0}
+
+    Map.merge(config, cursor)
+  end
+
+  # `Stream.resource/3` step: halts once every source is consumed, else emits a batch.
+  defp fetch_next_batch(%{current_url_index: idx, total_urls: total} = state)
+       when idx >= total,
+       do: {:halt, state}
+
+  defp fetch_next_batch(state) do
+    case ensure_lazy_df_loaded(state) do
+      {:ok, ready} ->
+        fetch_batch_from_lazy_df(ready)
+      # A source that cannot be opened is skipped without surfacing the error.
+      {:error, _reason} ->
+        next_index = state.current_url_index + 1
+        fetch_next_batch(%{state | current_url_index: next_index, current_offset: 0})
+    end
+  end
+
+  # Makes sure a frame is loaded for the current source. When none is loaded
+  # yet, it is opened from the source tuple found at `current_url_index`.
+  defp ensure_lazy_df_loaded(%{current_lazy_df: nil} = state) do
+    {source, extension, auth_token} = Enum.at(state.urls, state.current_url_index)
+
+    case load_lazy_dataframe(source, extension, auth_token) do
+      {:error, reason} -> {:error, reason}
+      {:ok, frame} -> {:ok, %{state | current_lazy_df: frame}}
+    end
+  end
+
+  # A frame is already loaded: nothing to do.
+  defp ensure_lazy_df_loaded(state), do: {:ok, state}
+
+  # Opens `url_or_path` as an Explorer data frame, dispatching on the file
+  # extension and on whether the location is an http(s) URL. The reader's
+  # result tuple is returned unchanged; any other extension yields
+  # `{:error, message}`.
+  # NOTE(review): the auth token is accepted but not used by any reader call.
+  defp load_lazy_dataframe(url_or_path, extension, _auth_token) do
+    remote? = String.starts_with?(url_or_path, ["http://", "https://"])
+
+    case {extension, remote?} do
+      # Parquet is opened lazily for both local paths and URLs.
+      {"parquet", _remote_or_local} ->
+        Explorer.DataFrame.from_parquet(url_or_path, lazy: true)
+
+      {"csv", false} ->
+        Explorer.DataFrame.from_csv(url_or_path, lazy: true)
+
+      # Remote CSV/JSONL are read without `lazy: true`; the batching code
+      # copes with both lazy and non-lazy frames.
+      {"csv", true} ->
+        Explorer.DataFrame.from_csv(url_or_path)
+
+      {"jsonl", false} ->
+        Explorer.DataFrame.from_ndjson(url_or_path, lazy: true)
+
+      # Same as remote CSV: no `lazy: true`.
+      {"jsonl", true} ->
+        Explorer.DataFrame.from_ndjson(url_or_path)
+
+      # Any other extension is rejected.
+      _unsupported ->
+        {:error, "Unsupported format for streaming: #{extension}"}
+    end
+  end
+
+  # Emits up to `batch_size` rows of the current frame, starting at the current
+  # offset, and returns the `Stream.resource/3` step result. A short or empty
+  # batch is taken as the end of the frame, so the cursor moves on to the next
+  # source.
+  defp fetch_batch_from_lazy_df(state) do
+    %{current_lazy_df: frame, current_offset: offset, batch_size: batch_size} = state
+
+    window = Explorer.DataFrame.slice(frame, offset, batch_size)
+
+    # A lazy slice has to be collected before its rows can be extracted.
+    materialized =
+      if Explorer.DataFrame.lazy?(window) do
+        Explorer.DataFrame.collect(window)
+      else
+        window
+      end
+
+    batch_rows = Explorer.DataFrame.to_rows(materialized)
+
+    # State to continue with once the current frame is exhausted: next source,
+    # no frame loaded, offset back at zero.
+    next_source = %{
+      state
+      | current_url_index: state.current_url_index + 1,
+        current_lazy_df: nil,
+        current_offset: 0
+    }
+
+    case length(batch_rows) do
+      # Nothing left in this frame: carry on with the next source right away.
+      0 ->
+        fetch_next_batch(next_source)
+
+      # A short batch is the last one of this frame: emit it, then move on.
+      row_count when row_count < batch_size ->
+        {batch_rows, next_source}
+
+      # A full batch: stay on this frame and advance the offset.
+      _full_batch ->
+        {batch_rows, %{state | current_offset: offset + batch_size}}
+    end
+  end
+
+  # `Stream.resource/3` teardown callback; no explicit cleanup is performed.
+  defp cleanup(_state), do: :ok
+end
diff --git a/lib/elixir_datasets/utils/loader.ex b/lib/elixir_datasets/utils/file_loader.ex
similarity index 95%
rename from lib/elixir_datasets/utils/loader.ex
rename to lib/elixir_datasets/utils/file_loader.ex
index fcd8715..fbeaeeb 100644
--- a/lib/elixir_datasets/utils/loader.ex
+++ b/lib/elixir_datasets/utils/file_loader.ex
@@ -96,11 +96,11 @@ defmodule ElixirDatasets.Utils.Loader do
## Examples
# Sequential loading
- paths = [{"data1.csv", "csv"}, {"data2.parquet", "parquet"}]
- datasets = load_datasets_from_paths!(paths)
+ iex> paths = [{"data1.csv", "csv"}, {"data2.parquet", "parquet"}]
+ iex> datasets = load_datasets_from_paths!(paths)
# Parallel loading with 4 processes
- datasets = load_datasets_from_paths!(paths, 4)
+ iex> datasets = load_datasets_from_paths!(paths, 4)
"""
@spec load_datasets_from_paths!([{Path.t(), String.t()}], pos_integer()) :: [
diff --git a/test/elixir_datasets/dataset_loader_test.exs b/test/elixir_datasets/dataset_loader_test.exs
new file mode 100644
index 0000000..bdf97e3
--- /dev/null
+++ b/test/elixir_datasets/dataset_loader_test.exs
@@ -0,0 +1,214 @@
+defmodule ElixirDatasets.LoaderTest do
+ use ExUnit.Case, async: false
+ doctest ElixirDatasets.Loader
+
+ alias ElixirDatasets.Loader
+
+  describe "load_spec/3" do
+    @cache_dir "test_cache_load_spec"
+    @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]}
+    @valid_repo_files %{
+      "resources/csv-test.csv" => "\"2dccc814f47c01b5344abbb72367a5b322656b0b\""
+    }
+    @invalid_repo_files %{"invalid.csv" => "\"1234567890asdfgh\""}
+
+    # Registered up front so the cache directory is removed even when an assertion fails.
+    setup do: on_exit(fn -> File.rm_rf!(@cache_dir) end)
+
+    test "loads valid files" do
+      assert {:ok, _paths} = Loader.load_spec(@repository, @valid_repo_files, 1)
+    end
+
+    test "returns error for invalid files" do
+      assert {:error, _reason} = Loader.load_spec(@repository, @invalid_repo_files, 1)
+    end
+
+    test "loads files with num_proc > 1" do
+      assert {:ok, paths} = Loader.load_spec(@repository, @valid_repo_files, 4)
+      assert is_list(paths)
+    end
+  end
+
+ describe "load_dataset/2" do
+ @cache_dir "test_cache_load_dataset"
+ @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]}
+
+ setup do
+ on_exit(fn ->
+ File.rm_rf!(@cache_dir)
+
+ File.rm_rf!(
+ :filename.basedir(
+ :user_cache,
+ "elixir_datasets" <> "/huggingface/aaaaa32r--elixirDatasets"
+ )
+ )
+ end)
+ end
+
+ test "loads a dataset from Hugging Face" do
+ assert {:ok, datasets} = Loader.load_dataset(@repository)
+ assert is_list(datasets)
+ end
+
+ test "loads a dataset from Hugging Face without opts" do
+ repository_short = {:hf, "aaaaa32r/elixirDatasets"}
+ assert {:ok, datasets} = Loader.load_dataset(repository_short)
+ assert is_list(datasets)
+ end
+
+ test "loads a dataset from local directory" do
+ repository = {:local, "resources"}
+ assert {:ok, datasets} = Loader.load_dataset(repository)
+ assert is_list(datasets)
+ end
+
+ test "raises error when invalid local directory" do
+ repository = {:local, "invalid/path"}
+ assert {:error, _reason} = Loader.load_dataset(repository)
+ end
+
+ test "loads dataset offline" do
+ repository = {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]}
+ assert {:ok, datasets} = Loader.load_dataset(repository)
+ assert is_list(datasets)
+
+ repository_offline =
+ {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, offline: true]}
+
+ assert {:ok, datasets} = Loader.load_dataset(repository_offline)
+ assert is_list(datasets)
+
+ repository_offline_invalid = {:hf, "not/exists", [cache_dir: @cache_dir, offline: true]}
+ assert {:error, _reason} = Loader.load_dataset(repository_offline_invalid)
+ end
+
+ test "loads a dataset from Hugging Face with subdirectory" do
+ repository_subdir =
+ {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, subdir: "resources"]}
+
+ assert {:ok, datasets} = Loader.load_dataset(repository_subdir)
+ assert is_list(datasets)
+ end
+
+ test "returns error for non-existent dataset" do
+ repository = {:test, "nonexistent/repo", []}
+
+ assert_raise ArgumentError, fn ->
+ Loader.load_dataset(repository)
+ end
+ end
+
+ test "loads dataset with split parameter from local directory" do
+ repository = {:local, "resources"}
+ assert {:ok, datasets} = Loader.load_dataset(repository, split: "train")
+ assert is_list(datasets)
+ end
+
+ test "loads dataset with name parameter filters files" do
+ repository = {:local, "resources"}
+ assert {:ok, datasets} = Loader.load_dataset(repository, name: "csv")
+ assert is_list(datasets)
+ end
+
+ test "loads dataset with split and name parameters combined" do
+ repository = {:local, "resources"}
+
+ assert {:ok, datasets} =
+ Loader.load_dataset(repository, split: "train", name: "csv")
+
+ assert is_list(datasets)
+ end
+
+ test "loads dataset with download_mode option" do
+ repository = {:local, "resources"}
+
+ assert {:ok, datasets} =
+ Loader.load_dataset(repository, download_mode: :reuse_dataset_if_exists)
+
+ assert is_list(datasets)
+ end
+
+ test "loads dataset with verification_mode option" do
+ repository = {:local, "resources"}
+
+ assert {:ok, datasets} =
+ Loader.load_dataset(repository, verification_mode: :no_checks)
+
+ assert is_list(datasets)
+ end
+
+ test "loads dataset with num_proc for parallel processing" do
+ repository = {:local, "resources"}
+ assert {:ok, datasets} = Loader.load_dataset(repository, num_proc: 2)
+ assert is_list(datasets)
+ assert length(datasets) > 0
+ end
+
+ test "loads dataset with num_proc=1 (sequential)" do
+ repository = {:local, "resources"}
+ assert {:ok, datasets} = Loader.load_dataset(repository, num_proc: 1)
+ assert is_list(datasets)
+ end
+
+ test "num_proc=4 is faster than num_proc=1 for parallel loading" do
+ repository = @repository
+
+ {time_sequential, {:ok, datasets_seq}} =
+ :timer.tc(fn ->
+ Loader.load_dataset(repository, num_proc: 1)
+ end)
+
+ {time_parallel, {:ok, datasets_par}} =
+ :timer.tc(fn ->
+ Loader.load_dataset(repository, num_proc: 4)
+ end)
+
+ assert length(datasets_seq) == length(datasets_par)
+
+ total_rows_seq =
+ Enum.reduce(datasets_seq, 0, fn df, acc ->
+ acc + Explorer.DataFrame.n_rows(df)
+ end)
+
+ total_rows_par =
+ Enum.reduce(datasets_par, 0, fn df, acc ->
+ acc + Explorer.DataFrame.n_rows(df)
+ end)
+
+ assert total_rows_seq == total_rows_par
+
+ assert time_parallel <= time_sequential * 1.5,
+ "Parallel processing overhead should be reasonable for this dataset size"
+ end
+
+ test "num_proc produces same results as sequential" do
+ repository = {:local, "resources"}
+
+ {:ok, datasets_seq} = Loader.load_dataset(repository, num_proc: 1)
+ {:ok, datasets_par} = Loader.load_dataset(repository, num_proc: 4)
+
+ assert length(datasets_seq) == length(datasets_par)
+ seq_row_counts = Enum.map(datasets_seq, &Explorer.DataFrame.n_rows/1) |> Enum.sort()
+ par_row_counts = Enum.map(datasets_par, &Explorer.DataFrame.n_rows/1) |> Enum.sort()
+
+ assert seq_row_counts == par_row_counts
+ end
+ end
+
+ describe "load_dataset!/2" do
+ test "loads dataset successfully" do
+ repository = {:local, "resources"}
+ datasets = Loader.load_dataset!(repository)
+ assert is_list(datasets)
+ end
+
+ test "raises error on failure" do
+ repository = {:local, "invalid/path"}
+
+ assert_raise RuntimeError, fn ->
+ Loader.load_dataset!(repository)
+ end
+ end
+ end
+end
diff --git a/test/elixir_datasets/filter_test.exs b/test/elixir_datasets/filter_test.exs
new file mode 100644
index 0000000..84788f4
--- /dev/null
+++ b/test/elixir_datasets/filter_test.exs
@@ -0,0 +1,117 @@
+defmodule ElixirDatasets.FilterTest do
+ use ExUnit.Case, async: true
+ doctest ElixirDatasets.Filter
+
+ alias ElixirDatasets.Filter
+
+ describe "by_config_and_split/3" do
+ @sample_files %{
+ "train.csv" => "etag1",
+ "test.csv" => "etag2",
+ "validation.csv" => "etag3",
+ "sst2/train.parquet" => "etag4",
+ "sst2/test.parquet" => "etag5",
+ "cola/train.parquet" => "etag6"
+ }
+
+ test "returns all files when no filters applied" do
+ assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, nil, nil)
+ assert filtered == @sample_files
+ end
+
+ test "filters by split name" do
+ assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, nil, "train")
+ assert map_size(filtered) == 3
+ assert Map.has_key?(filtered, "train.csv")
+ assert Map.has_key?(filtered, "sst2/train.parquet")
+ assert Map.has_key?(filtered, "cola/train.parquet")
+ end
+
+ test "filters by config name" do
+ assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, "sst2", nil)
+ assert map_size(filtered) == 2
+ assert Map.has_key?(filtered, "sst2/train.parquet")
+ assert Map.has_key?(filtered, "sst2/test.parquet")
+ end
+
+ test "filters by both config and split" do
+ assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, "sst2", "train")
+ assert map_size(filtered) == 1
+ assert Map.has_key?(filtered, "sst2/train.parquet")
+ end
+
+ test "returns empty map when no matches" do
+ assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, "nonexistent", nil)
+ assert map_size(filtered) == 0
+ end
+ end
+
+ describe "by_config_name/2" do
+ @sample_files %{
+ "train.csv" => "etag1",
+ "sst2/train.parquet" => "etag2",
+ "cola/train.parquet" => "etag3"
+ }
+
+ test "returns all files when config is nil" do
+ filtered = Filter.by_config_name(@sample_files, nil)
+ assert filtered == @sample_files
+ end
+
+ test "filters files by config name" do
+ filtered = Filter.by_config_name(@sample_files, "sst2")
+ assert is_map(filtered)
+ assert map_size(filtered) == 1
+ assert Map.has_key?(filtered, "sst2/train.parquet")
+ end
+
+ test "works with list input" do
+ files_list = [{"train.csv", "etag1"}, {"sst2/train.parquet", "etag2"}]
+ filtered = Filter.by_config_name(files_list, "sst2")
+ assert is_list(filtered)
+ assert length(filtered) == 1
+ assert {"sst2/train.parquet", "etag2"} in filtered
+ end
+ end
+
+ describe "by_split/2" do
+ @sample_files %{
+ "train.csv" => "etag1",
+ "test.csv" => "etag2",
+ "validation.csv" => "etag3"
+ }
+
+ test "returns all files when split is nil" do
+ filtered = Filter.by_split(@sample_files, nil)
+ assert filtered == @sample_files
+ end
+
+ test "filters files by split name" do
+ filtered = Filter.by_split(@sample_files, "train")
+ assert is_map(filtered)
+ assert map_size(filtered) == 1
+ assert Map.has_key?(filtered, "train.csv")
+ end
+
+ test "filters files with split in basename" do
+ files = %{
+ "train-00000.parquet" => "etag1",
+ "test-00000.parquet" => "etag2",
+ "train-00001.parquet" => "etag3"
+ }
+
+ filtered = Filter.by_split(files, "train")
+ assert map_size(filtered) == 2
+ assert Map.has_key?(filtered, "train-00000.parquet")
+ assert Map.has_key?(filtered, "train-00001.parquet")
+ end
+
+ test "works with list input" do
+ files_list = [{"train.csv", "etag1"}, {"test.csv", "etag2"}]
+ filtered = Filter.by_split(files_list, "train")
+ assert is_list(filtered)
+ assert length(filtered) == 1
+ assert {"train.csv", "etag1"} in filtered
+ end
+ end
+end
diff --git a/test/huggingface/hub_test.exs b/test/elixir_datasets/huggingface/hub_test.exs
similarity index 94%
rename from test/huggingface/hub_test.exs
rename to test/elixir_datasets/huggingface/hub_test.exs
index 0e86eff..d56e90f 100644
--- a/test/huggingface/hub_test.exs
+++ b/test/elixir_datasets/huggingface/hub_test.exs
@@ -185,11 +185,6 @@ defmodule ElixirDatasets.HuggingFace.HubTest do
{:ok, cached_path} = ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts)
File.rm!(cached_path)
- IO.puts("\n π Testing verification_mode behavior:")
- IO.puts(" Cache file deleted: #{cached_path}")
-
- IO.puts("\n 1. With verification_mode: :basic_checks (offline)")
-
result_basic =
ElixirDatasets.HuggingFace.Hub.cached_download(
@url,
@@ -198,16 +193,12 @@ defmodule ElixirDatasets.HuggingFace.HubTest do
case result_basic do
{:error, msg} ->
- IO.puts(" β Failed as expected: #{msg}")
assert msg =~ "cached file not found"
{:ok, _} ->
- IO.puts(" β Should have failed!")
flunk("Expected :basic_checks to fail with missing file")
end
- IO.puts("\n 2. With verification_mode: :no_checks (offline)")
-
result_no_checks =
ElixirDatasets.HuggingFace.Hub.cached_download(
@url,
@@ -216,21 +207,13 @@ defmodule ElixirDatasets.HuggingFace.HubTest do
case result_no_checks do
{:ok, path} ->
- IO.puts(" β Succeeded (returns path without checking)")
- IO.puts(" β Returned path: #{path}")
- IO.puts(" β File exists? #{File.exists?(path)}")
assert path == cached_path
refute File.exists?(path)
{:error, msg} ->
- IO.puts(" β Should have succeeded!")
flunk("Expected :no_checks to succeed, got error: #{msg}")
end
- IO.puts("\n β
verification_mode works correctly!")
- IO.puts(" :basic_checks = validates file exists")
- IO.puts(" :no_checks = skips validation (faster but risky)")
-
File.rm_rf!(@cache_dir)
end
end
diff --git a/test/elixir_datasets/info_getter_test.exs b/test/elixir_datasets/info_getter_test.exs
new file mode 100644
index 0000000..0b9e611
--- /dev/null
+++ b/test/elixir_datasets/info_getter_test.exs
@@ -0,0 +1,96 @@
+defmodule ElixirDatasets.InfoTest do
+  # Exercises `ElixirDatasets.Info`. The `get_*` tests pass the repository id
+  # below to the module under test; `parse_dataset_infos/1` is fed an in-memory
+  # map instead.
+  use ExUnit.Case, async: true
+  doctest ElixirDatasets.Info
+
+  alias ElixirDatasets.DatasetInfo
+  alias ElixirDatasets.Info
+
+  @repo_id "aaaaa32r/elixirDatasets"
+
+  describe "get_dataset_info/2" do
+    test "fetches dataset info from Hugging Face API" do
+      assert {:ok, info} = Info.get_dataset_info(@repo_id)
+      assert is_map(info)
+      assert info["id"] == @repo_id
+
+      assert is_map(info["cardData"])
+      configs = info["cardData"]["dataset_info"]
+      assert is_list(configs)
+
+      # Only the leading config and its leading split are inspected.
+      head_config = List.first(configs)
+      assert head_config["config_name"] == "csv"
+      assert is_list(head_config["features"])
+      assert is_list(head_config["splits"])
+
+      head_split = List.first(head_config["splits"])
+      assert head_split["num_examples"] == 10
+    end
+  end
+
+  describe "get_dataset_infos/2" do
+    test "fetches dataset infos as DatasetInfo structs" do
+      assert {:ok, infos} = Info.get_dataset_infos(@repo_id)
+      assert is_list(infos)
+      refute Enum.empty?(infos)
+
+      head_info = List.first(infos)
+      assert %DatasetInfo{} = head_info
+      assert head_info.config_name == "csv"
+      assert is_list(head_info.features)
+      assert is_list(head_info.splits)
+    end
+  end
+
+  describe "parse_dataset_infos/1" do
+    test "parses raw dataset info map into DatasetInfo structs" do
+      features = [%{"name" => "id", "dtype" => "int64"}]
+      splits = [%{"name" => "train", "num_examples" => 10}]
+
+      raw = %{
+        "cardData" => %{
+          "dataset_info" => [
+            %{"config_name" => "csv", "features" => features, "splits" => splits}
+          ]
+        }
+      }
+
+      infos = Info.parse_dataset_infos(raw)
+      assert is_list(infos)
+      assert length(infos) == 1
+
+      parsed = List.first(infos)
+      assert %DatasetInfo{} = parsed
+      assert parsed.config_name == "csv"
+      assert parsed.features == features
+      assert parsed.splits == splits
+    end
+
+    test "handles missing dataset_info gracefully" do
+      # A card without a "dataset_info" entry must yield an empty list.
+      assert Info.parse_dataset_infos(%{"cardData" => %{}}) == []
+    end
+  end
+
+  describe "get_dataset_split_names/2" do
+    test "fetches split names from dataset" do
+      assert {:ok, splits} = Info.get_dataset_split_names(@repo_id)
+      assert is_list(splits)
+      refute Enum.empty?(splits)
+      assert Enum.all?(splits, &is_binary/1)
+    end
+  end
+
+  describe "get_dataset_config_names/2" do
+    test "fetches config names from dataset" do
+      assert {:ok, configs} = Info.get_dataset_config_names(@repo_id)
+      assert is_list(configs)
+      refute Enum.empty?(configs)
+      assert Enum.all?(configs, &is_binary/1)
+      assert "csv" in configs
+    end
+  end
+end
diff --git a/test/elixir_datasets/repository_test.exs b/test/elixir_datasets/repository_test.exs
new file mode 100644
index 0000000..e218353
--- /dev/null
+++ b/test/elixir_datasets/repository_test.exs
@@ -0,0 +1,63 @@
+defmodule ElixirDatasets.RepositoryTest do
+  # Covers `ElixirDatasets.Repository`: tuple normalization, file listing for
+  # `{:local, dir}` repositories, and cache-scope naming.
+  use ExUnit.Case, async: true
+  doctest ElixirDatasets.Repository
+
+  alias ElixirDatasets.Repository
+
+  describe "normalize!/1" do
+    test "normalizes {:hf, repository_id} format" do
+      # The two-element form gains an empty options list.
+      assert {:hf, "user/repo", []} = Repository.normalize!({:hf, "user/repo"})
+    end
+
+    test "normalizes {:hf, repository_id, opts} format" do
+      input = {:hf, "user/repo", [revision: "main", cache_dir: "/tmp"]}
+      {:hf, "user/repo", normalized_opts} = Repository.normalize!(input)
+
+      assert Keyword.get(normalized_opts, :revision) == "main"
+      assert Keyword.get(normalized_opts, :cache_dir) == "/tmp"
+    end
+
+    test "normalizes {:local, dir} format" do
+      assert {:local, "/path/to/dir"} = Repository.normalize!({:local, "/path/to/dir"})
+    end
+
+    test "raises error for invalid format" do
+      bad_repository = {:invalid, "repo"}
+      assert_raise ArgumentError, fn -> Repository.normalize!(bad_repository) end
+    end
+
+    test "raises error for invalid options" do
+      bad_repository = {:hf, "user/repo", [invalid_opt: true]}
+      assert_raise ArgumentError, fn -> Repository.normalize!(bad_repository) end
+    end
+  end
+
+  describe "get_files/1" do
+    test "gets files from local directory" do
+      assert {:ok, files} = Repository.get_files({:local, "resources"})
+      assert is_map(files)
+      assert map_size(files) > 0
+    end
+
+    test "returns error for non-existent local directory" do
+      assert {:error, _reason} = Repository.get_files({:local, "non_existent_dir"})
+    end
+  end
+
+  describe "repository_id_to_cache_scope/1" do
+    test "converts repository ID to cache scope" do
+      assert Repository.repository_id_to_cache_scope("user/repo") == "user--repo"
+    end
+
+    test "removes special characters" do
+      assert Repository.repository_id_to_cache_scope("user/repo-name") == "user--repo-name"
+    end
+
+    test "handles underscores" do
+      assert Repository.repository_id_to_cache_scope("user/repo_name") == "user--repo_name"
+    end
+  end
+end
diff --git a/test/elixir_datasets/streaming_test.exs b/test/elixir_datasets/streaming_test.exs
new file mode 100644
index 0000000..f0d61ea
--- /dev/null
+++ b/test/elixir_datasets/streaming_test.exs
@@ -0,0 +1,119 @@
+defmodule ElixirDatasets.StreamingTest do
+  # Streaming-mode tests for `ElixirDatasets.Loader.load_dataset/2`.
+  #
+  # Runs with `async: false`; the `on_exit` hook below deletes cache
+  # directories on disk.
+  use ExUnit.Case, async: false
+  doctest ElixirDatasets.Streaming
+
+  alias ElixirDatasets.Loader
+
+  describe "streaming mode" do
+    @cache_dir "test_cache_streaming"
+    @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]}
+
+    setup do
+      # Remove the per-test cache directory and the repository's entry under
+      # the user cache directory once each test has finished.
+      on_exit(fn ->
+        File.rm_rf!(@cache_dir)
+
+        File.rm_rf!(
+          :filename.basedir(
+            :user_cache,
+            "elixir_datasets" <> "/huggingface/aaaaa32r--elixirDatasets"
+          )
+        )
+      end)
+    end
+
+    test "loads dataset with streaming parameter returns Stream" do
+      repository = {:local, "resources"}
+      assert {:ok, stream} = Loader.load_dataset(repository, streaming: true)
+
+      assert is_function(stream, 2), "Expected a Stream (function/2)"
+
+      rows = Enum.take(stream, 5)
+      assert is_list(rows)
+      assert Enum.all?(rows, &is_map/1), "Each row should be a map"
+    end
+
+    test "streaming mode fetches data progressively" do
+      repository = {:local, "resources"}
+      assert {:ok, stream} = Loader.load_dataset(repository, streaming: true)
+
+      rows = Enum.take(stream, 3)
+
+      # `<= 3` is always true after `Enum.take(_, 3)`; require the exact count,
+      # as the laziness test below already does for the same call.
+      assert length(rows) == 3
+      assert Enum.all?(rows, &is_map/1)
+    end
+
+    test "streaming with custom batch_size" do
+      repository = {:local, "resources"}
+
+      assert {:ok, stream} = Loader.load_dataset(repository, streaming: true, batch_size: 2)
+
+      rows = Enum.take(stream, 5)
+      assert is_list(rows)
+    end
+
+    test "streaming is lazy - data fetched on demand, not upfront" do
+      repository = {:local, "resources"}
+
+      {:ok, stream} = Loader.load_dataset(repository, streaming: true)
+
+      # The stream is enumerated twice back to back: no assertion here depends
+      # on elapsed time, so the test does not sleep between reads.
+      assert length(Enum.take(stream, 3)) == 3
+      assert length(Enum.take(stream, 5)) == 5
+
+      fetch_count = :counters.new(1, [:atomics])
+
+      # Bump the counter for every row that passes through the mapped stream.
+      counted_stream =
+        Stream.map(stream, fn row ->
+          :counters.add(fetch_count, 1, 1)
+          row
+        end)
+
+      _small_batch = Enum.take(counted_stream, 2)
+      count_after_2 = :counters.get(fetch_count, 1)
+
+      # Reset so the second measurement starts from zero.
+      :counters.put(fetch_count, 1, 0)
+
+      _large_batch = Enum.take(counted_stream, 10)
+      count_after_10 = :counters.get(fetch_count, 1)
+
+      assert count_after_2 <= 5, "Should fetch minimal rows for small take"
+      assert count_after_10 >= 8, "Should fetch more rows for larger take"
+    end
+
+    test "streaming from HuggingFace demonstrates progressive fetching" do
+      {:ok, stream} = Loader.load_dataset(@repository, streaming: true, batch_size: 5)
+
+      assert length(Enum.take(stream, 3)) == 3
+      assert length(Enum.take(stream, 8)) == 8
+
+      result =
+        stream
+        |> Stream.filter(fn row -> Map.has_key?(row, "id") end)
+        |> Stream.take(5)
+        |> Enum.to_list()
+
+      assert length(result) <= 5
+    end
+
+    test "verification_mode works with streaming" do
+      # Each mode must produce a stream that yields rows.
+      for mode <- [:basic_checks, :no_checks] do
+        {:ok, stream} =
+          Loader.load_dataset(@repository, streaming: true, verification_mode: mode)
+
+        assert length(Enum.take(stream, 2)) == 2
+      end
+    end
+  end
+end
diff --git a/test/elixir_datasets_test.exs b/test/elixir_datasets_test.exs
index 4399d9e..ee7b547 100644
--- a/test/elixir_datasets_test.exs
+++ b/test/elixir_datasets_test.exs
@@ -10,42 +10,9 @@ defmodule ElixirDatasetsTest do
assert Code.ensure_loaded?(ElixirDatasets)
end
- describe "do_load_spec/2" do
- @cache_dir "test_cache_do_load_spec"
- @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]}
- @valid_repo_files %{
- "resources/csv-test.csv" => "\"2dccc814f47c01b5344abbb72367a5b322656b0b\""
- }
- @invalid_repo_files %{"invalid.csv" => "\"1234567890asdfgh\""}
-
- test "Loads valid files" do
- assert {:ok, _paths} = ElixirDatasets.do_load_spec(@repository, @valid_repo_files, 1)
- File.rm_rf!(@cache_dir)
- end
-
- test "Return error for invalid files" do
- assert {:error, _reason} = ElixirDatasets.do_load_spec(@repository, @invalid_repo_files, 1)
-
- File.rm_rf!(@cache_dir)
- end
- end
-
- describe "decode_config/1" do
- test "Decodes a valid JSON file" do
- File.write!("valid.json", ~s({"key": "value"}))
- assert {:ok, %{"key" => "value"}} = ElixirDatasets.decode_config("valid.json")
- File.rm!("valid.json")
- end
-
- test "Fails to decode JSON file" do
- File.write!("invalid.json", "{invalid_json}")
- assert {:error, _} = ElixirDatasets.decode_config("invalid.json")
- File.rm!("invalid.json")
- end
- end
-
- describe "load_dataset/2" do
- @cache_dir "test_cache_load_dataset"
+ # Integration tests for public API
+ describe "load_dataset/2 - Public API Integration Tests" do
+ @cache_dir "test_cache_integration"
@repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]}
setup do
@@ -66,332 +33,27 @@ defmodule ElixirDatasetsTest do
assert is_list(datasets)
end
- test "loads a dataset from Hugging Face without opts" do
- repository_short = {:hf, "aaaaa32r/elixirDatasets"}
- assert {:ok, datasets} = ElixirDatasets.load_dataset(repository_short)
- assert is_list(datasets)
- end
-
test "loads a dataset from local directory" do
repository = {:local, "resources"}
assert {:ok, datasets} = ElixirDatasets.load_dataset(repository)
assert is_list(datasets)
end
+ end
- test "raise error when invalid local directory" do
- repository = {:local, "invalid/path"}
- assert {:error, _reason} = ElixirDatasets.load_dataset(repository)
- end
-
- test "loads dataset offline" do
- repository = {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]}
- assert {:ok, datasets} = ElixirDatasets.load_dataset(repository)
- assert is_list(datasets)
- repositoryOffline = {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, offline: true]}
- assert {:ok, datasets} = ElixirDatasets.load_dataset(repositoryOffline)
- assert is_list(datasets)
- repositoryOfflineInvalid = {:hf, "not/exists", [cache_dir: @cache_dir, offline: true]}
- assert {:error, _reason} = ElixirDatasets.load_dataset(repositoryOfflineInvalid)
- end
-
- test "loads a dataset from Hugging Face with subdirectory" do
- repositorySubdir =
- {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, subdir: "resources"]}
-
- assert {:ok, datasets} = ElixirDatasets.load_dataset(repositorySubdir)
- assert is_list(datasets)
- end
-
- test "returns error for non-existent dataset" do
- repository = {:test, "nonexistent/repo", []}
-
- assert_raise ArgumentError, fn ->
- ElixirDatasets.load_dataset(repository)
- end
- end
-
- test "loads dataset with split parameter from local directory" do
- repository = {:local, "resources"}
- assert {:ok, datasets} = ElixirDatasets.load_dataset(repository, split: "train")
- assert is_list(datasets)
- end
-
- test "loads dataset with name parameter filters files" do
- repository = {:local, "resources"}
- assert {:ok, datasets} = ElixirDatasets.load_dataset(repository, name: "csv")
- assert is_list(datasets)
- end
-
- test "loads dataset with streaming parameter returns Stream" do
- repository = {:local, "resources"}
- assert {:ok, stream} = ElixirDatasets.load_dataset(repository, streaming: true)
-
- assert is_function(stream, 2), "Expected a Stream (function/2)"
-
- rows = stream |> Enum.take(5)
- assert is_list(rows)
- assert Enum.all?(rows, &is_map/1), "Each row should be a map"
- end
-
- test "streaming mode fetches data progressively" do
- repository = {:local, "resources"}
- assert {:ok, stream} = ElixirDatasets.load_dataset(repository, streaming: true)
-
- rows = stream |> Enum.take(3)
- assert length(rows) <= 3
- assert Enum.all?(rows, &is_map/1)
- end
-
- test "streaming with custom batch_size" do
- repository = {:local, "resources"}
-
- assert {:ok, stream} =
- ElixirDatasets.load_dataset(
- repository,
- streaming: true,
- batch_size: 2
- )
-
- rows = stream |> Enum.take(5)
- assert is_list(rows)
- end
-
- test "streaming is lazy - data fetched on demand, not upfront" do
- repository = {:local, "resources"}
-
- {:ok, stream} = ElixirDatasets.load_dataset(repository, streaming: true)
-
- IO.puts("\n π Testing lazy streaming behavior:")
-
- IO.puts(" 1. Fetching first 3 rows...")
-
- {time1, rows1} =
- :timer.tc(fn ->
- stream |> Enum.take(3)
- end)
-
- IO.puts(" β Got #{length(rows1)} rows in #{time1 / 1000}ms")
- assert length(rows1) == 3
-
- IO.puts(" 2. Waiting 2 seconds...")
- Process.sleep(2000)
-
- IO.puts(" 3. Fetching 5 rows from same stream...")
-
- {time2, rows2} =
- :timer.tc(fn ->
- stream |> Enum.take(5)
- end)
-
- IO.puts(" β Got #{length(rows2)} rows in #{time2 / 1000}ms")
- assert length(rows2) == 5
-
- IO.puts(" 4. Key insight: Stream is reusable, each Enum.take starts fresh")
-
- IO.puts(" 5. Demonstrating progressive fetching...")
-
- fetch_count = :counters.new(1, [:atomics])
-
- counted_stream =
- stream
- |> Stream.map(fn row ->
- :counters.add(fetch_count, 1, 1)
- row
- end)
-
- IO.puts(" Taking 2 rows...")
- _small_batch = counted_stream |> Enum.take(2)
- count_after_2 = :counters.get(fetch_count, 1)
- IO.puts(" β Fetched #{count_after_2} rows (should be ~2)")
-
- :counters.put(fetch_count, 1, 0)
-
- IO.puts(" Taking 10 rows...")
- _large_batch = counted_stream |> Enum.take(10)
- count_after_10 = :counters.get(fetch_count, 1)
- IO.puts(" β Fetched #{count_after_10} rows (should be ~10)")
-
- assert count_after_2 <= 5, "Should fetch minimal rows for small take"
- assert count_after_10 >= 8, "Should fetch more rows for larger take"
-
- IO.puts(" ✅ Streaming is truly lazy - fetches only what's needed!")
- end
-
- test "streaming from HuggingFace demonstrates progressive fetching" do
- repository = @repository
-
- IO.puts("\n π Testing HuggingFace streaming:")
-
- {:ok, stream} = ElixirDatasets.load_dataset(repository, streaming: true, batch_size: 5)
- IO.puts(" β Created stream (no data downloaded yet)")
-
- IO.puts(" 1. Fetching only 3 rows...")
-
- {time1, rows1} =
- :timer.tc(fn ->
- stream |> Enum.take(3)
- end)
-
- IO.puts(" β Got #{length(rows1)} rows in #{Float.round(time1 / 1000, 2)}ms")
- assert length(rows1) == 3
-
- IO.puts(" 2. Waiting 1 second...")
- Process.sleep(1000)
-
- IO.puts(" 3. Fetching 8 rows from same stream...")
-
- {time2, rows2} =
- :timer.tc(fn ->
- stream |> Enum.take(8)
- end)
-
- IO.puts(" β Got #{length(rows2)} rows in #{Float.round(time2 / 1000, 2)}ms")
- assert length(rows2) == 8
-
- IO.puts(" 4. Processing with Stream operations (lazy)...")
-
- result =
- stream
- |> Stream.filter(fn row -> Map.has_key?(row, "id") end)
- |> Stream.take(5)
- |> Enum.to_list()
-
- IO.puts(" β Processed and got #{length(result)} filtered rows")
- assert length(result) <= 5
-
- IO.puts(" ✅ HuggingFace streaming works progressively!")
- end
-
- test "verification_mode works with streaming" do
- repository = @repository
-
- IO.puts("\n π Testing verification_mode with streaming:")
-
- IO.puts(" 1. With verification_mode: :basic_checks (default)...")
-
- {:ok, stream1} =
- ElixirDatasets.load_dataset(
- repository,
- streaming: true,
- verification_mode: :basic_checks
- )
-
- rows1 = stream1 |> Enum.take(2)
- IO.puts(" β Got #{length(rows1)} rows")
- assert length(rows1) == 2
-
- IO.puts(" 2. With verification_mode: :no_checks...")
-
- {:ok, stream2} =
- ElixirDatasets.load_dataset(
- repository,
- streaming: true,
- verification_mode: :no_checks
- )
-
- rows2 = stream2 |> Enum.take(2)
- IO.puts(" β Got #{length(rows2)} rows")
- assert length(rows2) == 2
-
- IO.puts(" βΉοΈ Note: verification_mode applies to metadata fetching,")
- IO.puts(" not to the streaming data itself (which comes from URLs)")
- IO.puts(" ✅ verification_mode works with streaming!")
- end
-
- test "loads dataset with split and name parameters combined" do
- repository = {:local, "resources"}
-
- assert {:ok, datasets} =
- ElixirDatasets.load_dataset(repository, split: "train", name: "csv")
-
- assert is_list(datasets)
- end
-
- test "loads dataset with download_mode option" do
- repository = {:local, "resources"}
-
- assert {:ok, datasets} =
- ElixirDatasets.load_dataset(repository, download_mode: :reuse_dataset_if_exists)
-
- assert is_list(datasets)
- end
-
- test "loads dataset with verification_mode option" do
- repository = {:local, "resources"}
-
- assert {:ok, datasets} =
- ElixirDatasets.load_dataset(repository, verification_mode: :no_checks)
-
- assert is_list(datasets)
- end
-
- test "loads dataset with num_proc for parallel processing" do
- repository = {:local, "resources"}
- assert {:ok, datasets} = ElixirDatasets.load_dataset(repository, num_proc: 2)
- assert is_list(datasets)
- assert length(datasets) > 0
- end
-
- test "loads dataset with num_proc=1 (sequential)" do
+ describe "load_dataset!/2 - Public API" do
+ test "loads dataset successfully" do
repository = {:local, "resources"}
- assert {:ok, datasets} = ElixirDatasets.load_dataset(repository, num_proc: 1)
+ datasets = ElixirDatasets.load_dataset!(repository)
assert is_list(datasets)
end
- test "num_proc=4 is faster than num_proc=1 for parallel loading" do
- repository = @repository
-
- {time_sequential, {:ok, datasets_seq}} =
- :timer.tc(fn ->
- ElixirDatasets.load_dataset(repository, num_proc: 1)
- end)
-
- {time_parallel, {:ok, datasets_par}} =
- :timer.tc(fn ->
- ElixirDatasets.load_dataset(repository, num_proc: 4)
- end)
-
- assert length(datasets_seq) == length(datasets_par)
-
- total_rows_seq =
- Enum.reduce(datasets_seq, 0, fn df, acc ->
- acc + Explorer.DataFrame.n_rows(df)
- end)
-
- total_rows_par =
- Enum.reduce(datasets_par, 0, fn df, acc ->
- acc + Explorer.DataFrame.n_rows(df)
- end)
-
- assert total_rows_seq == total_rows_par
-
- time_seq_sec = time_sequential / 1_000_000
- time_par_sec = time_parallel / 1_000_000
- speedup = time_sequential / time_parallel
-
- IO.puts("\n β±οΈ Performance Comparison:")
- IO.puts(" Sequential (num_proc: 1): #{Float.round(time_seq_sec, 3)}s")
- IO.puts(" Parallel (num_proc: 4): #{Float.round(time_par_sec, 3)}s")
- IO.puts(" Speedup: #{Float.round(speedup, 2)}x")
-
- assert time_parallel <= time_sequential * 1.5,
- "Parallel processing overhead should be reasonable for this dataset size (no more than 1.5x slower than sequential)"
- end
-
- test "num_proc produces same results as sequential" do
- repository = {:local, "resources"}
-
- {:ok, datasets_seq} = ElixirDatasets.load_dataset(repository, num_proc: 1)
- {:ok, datasets_par} = ElixirDatasets.load_dataset(repository, num_proc: 4)
-
- assert length(datasets_seq) == length(datasets_par)
- seq_row_counts = Enum.map(datasets_seq, &Explorer.DataFrame.n_rows/1) |> Enum.sort()
- par_row_counts = Enum.map(datasets_par, &Explorer.DataFrame.n_rows/1) |> Enum.sort()
+ test "raises error on failure" do
+ repository = {:local, "invalid/path"}
- assert seq_row_counts == par_row_counts
+ assert_raise RuntimeError, fn ->
+ ElixirDatasets.load_dataset!(repository)
+ end
end
-
- # todo more tests for load_dataset/2
end
describe "cache_dir/0" do
@@ -421,87 +83,36 @@ defmodule ElixirDatasetsTest do
end
end
- describe "get_dataset_info/2" do
+ # Public API tests for dataset info functions
+ describe "get_dataset_info/2 - Public API" do
test "fetches dataset info from Hugging Face API" do
assert {:ok, info} = ElixirDatasets.get_dataset_info("aaaaa32r/elixirDatasets")
assert is_map(info)
assert info["id"] == "aaaaa32r/elixirDatasets"
-
- assert is_map(info["cardData"])
- dataset_info = info["cardData"]["dataset_info"]
- assert is_list(dataset_info)
-
- first_config = Enum.at(dataset_info, 0)
- assert first_config["config_name"] == "csv"
- assert is_list(first_config["features"])
- assert is_list(first_config["splits"])
-
- first_split = Enum.at(first_config["splits"], 0)
- assert first_split["num_examples"] == 10
end
end
- describe "get_dataset_infos/2" do
+ describe "get_dataset_infos/2 - Public API" do
test "fetches dataset infos as DatasetInfo structs" do
assert {:ok, infos} = ElixirDatasets.get_dataset_infos("aaaaa32r/elixirDatasets")
assert is_list(infos)
assert Enum.count(infos) > 0
-
- first_info = Enum.at(infos, 0)
- assert %ElixirDatasets.DatasetInfo{} = first_info
- assert first_info.config_name == "csv"
- assert is_list(first_info.features)
- assert is_list(first_info.splits)
- end
- end
-
- describe "parse_dataset_infos/1" do
- test "parses raw dataset info map into DatasetInfo structs" do
- data = %{
- "cardData" => %{
- "dataset_info" => [
- %{
- "config_name" => "csv",
- "features" => [%{"name" => "id", "dtype" => "int64"}],
- "splits" => [%{"name" => "train", "num_examples" => 10}]
- }
- ]
- }
- }
-
- infos = ElixirDatasets.parse_dataset_infos(data)
- assert is_list(infos)
- assert Enum.count(infos) == 1
-
- first_info = Enum.at(infos, 0)
- assert %ElixirDatasets.DatasetInfo{} = first_info
- assert first_info.config_name == "csv"
- assert first_info.features == [%{"name" => "id", "dtype" => "int64"}]
- assert first_info.splits == [%{"name" => "train", "num_examples" => 10}]
- end
-
- test "handles missing dataset_info gracefully" do
- data = %{"cardData" => %{}}
- infos = ElixirDatasets.parse_dataset_infos(data)
- assert infos == []
end
end
- describe "get_dataset_split_names/2" do
+ describe "get_dataset_split_names/2 - Public API" do
test "fetches split names from dataset" do
assert {:ok, splits} = ElixirDatasets.get_dataset_split_names("aaaaa32r/elixirDatasets")
assert is_list(splits)
assert Enum.count(splits) > 0
- assert Enum.all?(splits, &is_binary/1)
end
end
- describe "get_dataset_config_names/2" do
+ describe "get_dataset_config_names/2 - Public API" do
test "fetches config names from dataset" do
assert {:ok, configs} = ElixirDatasets.get_dataset_config_names("aaaaa32r/elixirDatasets")
assert is_list(configs)
assert Enum.count(configs) > 0
- assert Enum.all?(configs, &is_binary/1)
assert Enum.member?(configs, "csv")
end
end