This section shows how to use PPLNN step by step with an example api_intro.cc. Refer to API Reference for more details.
In PPLNN, an Engine is a collection of op implementations running on specified devices such as CPU or NVIDIA GPU. For example, we can use the built-in X86EngineFactory:
Engine* X86EngineFactory::Create(const X86EngineOptions&);

to create an engine which runs on x86-compatible CPUs:
X86EngineOptions x86_options;
Engine* x86_engine = X86EngineFactory::Create(x86_options);

Or use
CudaEngineOptions cuda_options;
// ... set options
Engine* cuda_engine = CudaEngineFactory::Create(cuda_options);

to create an engine running on NVIDIA GPUs.
We create a RuntimeBuilder with the following function:
RuntimeBuilder* OnnxRuntimeBuilderFactory::Create(
    const char* model_file, Engine** engines, uint32_t engine_num);

where the second parameter engines is the x86_engine we created:
vector<unique_ptr<Engine>> engines;
engines.emplace_back(unique_ptr<Engine>(x86_engine));

PPLNN supports multiple engines running in the same model. For example:
Engine* x86_engine = X86EngineFactory::Create(X86EngineOptions());
Engine* cuda_engine = CudaEngineFactory::Create(CudaEngineOptions());
vector<unique_ptr<Engine>> engines;
engines.emplace_back(unique_ptr<Engine>(x86_engine));
engines.emplace_back(unique_ptr<Engine>(cuda_engine));
// TODO add other engines
const char* model_file = "/path/to/onnx/model";
// use x86 and cuda engines to run this model
vector<Engine*> engine_ptrs = {engines[0].get(), engines[1].get()};
RuntimeBuilder* builder = OnnxRuntimeBuilderFactory::Create(model_file, engine_ptrs.data(), engine_ptrs.size());

PPLNN will partition the model and assign different ops to these engines according to configurations.
We can use
Runtime* RuntimeBuilder::CreateRuntime();

to create a Runtime:
Runtime* runtime = builder->CreateRuntime();

We can get graph inputs using the following functions of Runtime:
uint32_t Runtime::GetInputCount() const;
Tensor* Runtime::GetInputTensor(uint32_t idx) const;

and fill input data (using random data in this example):
for (uint32_t c = 0; c < runtime->GetInputCount(); ++c) {
auto t = runtime->GetInputTensor(c);
auto& shape = t->GetShape();
auto nr_element = shape.GetBytesIncludingPadding() / sizeof(float);
vector<float> buffer(nr_element);
// fill random input data
std::default_random_engine eng;
std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
for (uint32_t i = 0; i < nr_element; ++i) {
buffer[i] = dis(eng);
}
auto status = t->ReallocBuffer();
if (status != RC_SUCCESS) {
// ......
}
// our random data is treated as NDARRAY
TensorShape src_desc = t->GetShape();
src_desc.SetDataFormat(DATAFORMAT_NDARRAY);
// input tensors may require different data format
status = t->ConvertFromHost((const void*)buffer.data(), src_desc);
if (status != RC_SUCCESS) {
// ......
}
}

Then run the model using Runtime::Run():
RetCode status = runtime->Run();

Before getting results we must wait for all operations to finish (some engines may run asynchronously):
RetCode status = runtime->Sync();

Then iterate each output:
for (uint32_t c = 0; c < runtime->GetOutputCount(); ++c) {
auto t = runtime->GetOutputTensor(c);
TensorShape dst_desc = t->GetShape();
dst_desc.SetDataFormat(DATAFORMAT_NDARRAY);
auto bytes = dst_desc.GetBytesIncludingPadding();
vector<char> buffer(bytes);
auto status = t->ConvertToHost((void*)buffer.data(), dst_desc);
// ......
}