/// Supported embedding model architectures.
///
/// Different model types use different underlying architectures and
/// tokenization strategies. Choose based on your use case:
/// - [bert]: General-purpose, fast, good quality
/// - [jina]: Optimized for semantic search, higher quality
///
/// Example:
/// ```dart
/// final embedder = EmbedAnything.fromPretrainedHf(
///   model: EmbeddingModel.bert,
///   modelId: 'sentence-transformers/all-MiniLM-L6-v2',
/// );
/// ```
enum EmbeddingModel {
  /// BERT-based models.
  ///
  /// BERT (Bidirectional Encoder Representations from Transformers)
  /// models are general-purpose sentence embedding models that work
  /// well for most semantic similarity tasks.
  ///
  /// Common BERT models:
  /// - `sentence-transformers/all-MiniLM-L6-v2` (384 dim, fast)
  /// - `sentence-transformers/all-MiniLM-L12-v2` (384 dim, better quality)
  ///
  /// Best for:
  /// - General semantic similarity
  /// - Fast inference requirements
  /// - Moderate quality requirements
  ///
  /// Performance:
  /// - Model load (warm cache): ~100ms
  /// - Single embedding latency (short text): ~5-10ms
  bert(0),

  /// Jina embedding models.
  ///
  /// Jina models are specifically optimized for semantic search
  /// and retrieval tasks, offering higher quality at the cost of
  /// slightly slower inference.
  ///
  /// Common Jina models:
  /// - `jinaai/jina-embeddings-v2-small-en` (512 dim, fast)
  /// - `jinaai/jina-embeddings-v2-base-en` (768 dim, high quality)
  ///
  /// Best for:
  /// - Semantic search applications
  /// - High-quality similarity matching
  /// - Document retrieval systems
  ///
  /// Performance:
  /// - Model load (warm cache): ~150ms
  /// - Single embedding latency (short text): ~10-15ms
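  ///
  /// Example (the same `fromPretrainedHf` constructor as the class-level
  /// example, pointed at a Jina model from the list above):
  /// ```dart
  /// final embedder = EmbedAnything.fromPretrainedHf(
  ///   model: EmbeddingModel.jina,
  ///   modelId: 'jinaai/jina-embeddings-v2-small-en',
  /// );
  /// ```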
  jina(1);

  const EmbeddingModel(this.value);

  /// Numeric value passed to Rust FFI.
  ///
  /// This internal value is used for communication with the native
  /// Rust layer and should not be used directly by applications.
  final int value;
}

/// Model data type for weights.
///
/// Determines the precision of model weights during inference.
/// Lower precision types (F16) provide faster inference and lower
/// memory usage at the cost of slightly reduced quality.
///
/// Example:
/// ```dart
/// // Use F16 for faster inference on resource-constrained systems
/// final config = ModelConfig(
///   modelId: 'sentence-transformers/all-MiniLM-L6-v2',
///   modelType: EmbeddingModel.bert,
///   dtype: ModelDtype.f16,
/// );
/// final embedder = EmbedAnything.fromConfig(config);
/// ```
///
/// Performance comparison (BERT all-MiniLM-L6-v2):
/// - F32: 100% quality, ~90MB memory, baseline speed
/// - F16: 99% quality, ~45MB memory, ~1.3x faster
///
/// See also:
/// - [ModelConfig] for configuring models
enum ModelDtype {
  /// 32-bit floating point (full precision).
  ///
  /// This is the default and recommended option for most use cases.
  /// Provides the highest quality embeddings at the cost of larger
  /// memory footprint and slightly slower inference.
  ///
  /// Memory usage (typical models):
  /// - BERT all-MiniLM-L6-v2: ~90MB
  /// - Jina v2-base-en: ~280MB
  ///
  /// Use when:
  /// - Quality is the top priority
  /// - Memory is not a constraint
  /// - Reproducibility across platforms is important
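  ///
  /// Example (requesting full precision explicitly, using [ModelConfig]
  /// as in the class-level example above):
  /// ```dart
  /// final config = ModelConfig(
  ///   modelId: 'sentence-transformers/all-MiniLM-L6-v2',
  ///   modelType: EmbeddingModel.bert,
  ///   dtype: ModelDtype.f32,
  /// );
  /// ```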
  f32(0),

  /// 16-bit floating point (half precision).
  ///
  /// Reduces memory usage by approximately 50% and can provide
  /// faster inference on supported hardware. The quality difference
  /// is typically negligible for most applications.
  ///
  /// Memory usage (typical models):
  /// - BERT all-MiniLM-L6-v2: ~45MB
  /// - Jina v2-base-en: ~140MB
  ///
  /// Use when:
  /// - Running on resource-constrained devices
  /// - Memory usage is a concern
  /// - Speed is more important than maximum quality
  ///
  /// Note: Not all platforms support F16 acceleration. On unsupported
  /// platforms, the model may fall back to F32 internally.
  f16(1);

  const ModelDtype(this.value);

  /// Numeric value passed to Rust FFI.
  ///
  /// Mapping:
  /// - 0 = F32 (full precision)
  /// - 1 = F16 (half precision)
  /// - -1 = default/None (handled in Rust)
  ///
  /// This internal value is used for communication with the native
  /// Rust layer and should not be used directly by applications.
  final int value;
}

/// Available compute device types for embedding operations.
///
/// The library automatically selects the best available device at model
/// load time based on compiled features and hardware availability:
/// 1. Metal (on macOS/iOS if GPU is available)
/// 2. CUDA (on Linux/Windows if NVIDIA GPU is available)
/// 3. CPU (fallback, always available)
///
/// Use [EmbedAnything.getActiveDevice] to query which device is being used,
/// or [EmbedAnything.isDeviceAvailable] to check availability.
///
/// Example:
/// ```dart
/// // Check what device will be used for embedding
/// final device = EmbedAnything.getActiveDevice();
/// print('Using: $device'); // e.g., "ComputeDevice.metal"
///
/// // Check if CUDA is available
/// if (EmbedAnything.isDeviceAvailable(ComputeDevice.cuda)) {
///   print('CUDA acceleration available!');
/// }
/// ```
///
/// Note: Which device backends are compiled in is fixed per platform at
/// build time:
/// - macOS/iOS: Metal + Accelerate features enabled
/// - Linux/Windows: MKL + CUDA (if toolkit detected) features enabled
enum ComputeDevice {
  /// CPU computation (always available).
  ///
  /// This is the fallback device used when no GPU acceleration is available.
  /// CPU operations can be optimized with:
  /// - Intel MKL on Linux/Windows
  /// - Apple Accelerate on macOS
  cpu(0),

  /// NVIDIA CUDA GPU acceleration.
  ///
  /// Available on Linux and Windows systems with:
  /// - NVIDIA GPU hardware
  /// - CUDA toolkit installed (detected at build time)
  ///
  /// Provides significant speedup for embedding operations, especially
  /// for batch processing.
  cuda(1),

  /// Apple Metal GPU acceleration.
  ///
  /// Available on macOS and iOS with Apple Silicon or AMD GPU.
  /// Automatically used on Apple platforms when available.
  metal(2);

  const ComputeDevice(this.value);

  /// Numeric value passed to Rust FFI.
  final int value;

  /// Creates a [ComputeDevice] from its numeric value.
  ///
  /// Returns [cpu] for unknown values.
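  ///
  /// Example:
  /// ```dart
  /// final device = ComputeDevice.fromValue(2);
  /// print(device); // ComputeDevice.metal
  /// ```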
  static ComputeDevice fromValue(int value) {
    return ComputeDevice.values.firstWhere(
      (d) => d.value == value,
      orElse: () => ComputeDevice.cpu,
    );
  }
}