diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f6f288d9906..89715eaea07 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -372,9 +372,14 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); SYCL_CHECK( CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); + // Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU. + // This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here. + char* host_buf = (char*)malloc(size); + memcpy(host_buf, data, size); SYCL_CHECK( - CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, data, size) + CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size) .wait())); + free(host_buf); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index 8d4fece2dca..f9bcadae022 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -139,6 +139,16 @@ def wrapped_fn(*args, **kwargs): if isinstance(res, cls._tensor_type): return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn) + elif isinstance(res, tuple) and all(isinstance(t, cls._tensor_type) for t in res): + # share the evaluation between lazy tuple elements + shared_args: list = [args, None] + + def eager_tuple_element(a: list[Any], i: int = 0, /, **kw) -> LazyBase: + assert len(a) == 2 + if a[1] is None: + a[1] = fn(*a[0], **kw) + return a[1][i] + return tuple(cls(meta=cls.eager_to_meta(res[i]), args=(shared_args, i), kwargs=kwargs, func=eager_tuple_element) for i in range(len(res))) else: del res # not needed # non-tensor return likely relies on the contents of the args