Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions lib/braintrust/api.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require_relative "api/datasets"
require_relative "api/functions"
require_relative "api/btql"

module Braintrust
# API client for Braintrust REST API
Expand Down Expand Up @@ -42,5 +43,11 @@ def login
# Build a permalink for an object by delegating to the shared state.
# @param object_type [String] Type of the object to link to
# @param object_id [String] ID of the object to link to
# @return [Object] Whatever @state.object_permalink returns
def object_permalink(object_type:, object_id:)
  link_target = {object_type: object_type, object_id: object_id}
  @state.object_permalink(**link_target)
end

# Lazily built BTQL API namespace; the same instance is returned on every call.
# @return [API::Btql]
def btql
  return @btql if @btql

  @btql = API::Btql.new(self)
end
end
end
86 changes: 86 additions & 0 deletions lib/braintrust/api/btql.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# frozen_string_literal: true

require "net/http"
require "json"
require "uri"
require_relative "../logger"

module Braintrust
class API
# BTQL API namespace
# Provides methods for querying spans and other data using BTQL
class Btql
  # @param api [API] Parent API client; its state supplies api_url and api_key
  def initialize(api)
    @api = api
    @state = api.state
  end

  # Query spans using BTQL
  # POST /btql
  # @param query [Hash] AST-based query filter
  # @param object_type [String] Type of object (e.g., "experiment")
  # @param object_id [String] Object ID
  # @param fmt [String] Response format (default: "jsonl")
  # @return [Hash] Response with :body (raw, unparsed response body) and
  #   :freshness_state (value of the x-bt-freshness-state header, or
  #   "complete" when the header is absent)
  # @raise [Error] if the server responds with a non-2xx status
  def query(query:, object_type:, object_id:, fmt: "jsonl")
    payload = {
      query: query,
      object_type: object_type,
      object_id: object_id,
      fmt: fmt
    }

    response = http_post_json_raw("/btql", payload)

    {
      body: response.body,
      freshness_state: response["x-bt-freshness-state"] || "complete"
    }
  end

  private

  # Core HTTP request method (copied from datasets.rb pattern)
  # @param method [Symbol] :get or :post
  # @param path [String] Path appended to the base URL
  # @param params [Hash] Query-string parameters (omitted when empty)
  # @param payload [Object, nil] JSON-serializable body, sent for :post only
  # @param base_url [String, nil] Overrides @state.api_url when given
  # @param parse_json [Boolean] When true return the parsed JSON body,
  #   otherwise return the raw Net::HTTPResponse
  # @return [Object, Net::HTTPResponse]
  # @raise [ArgumentError] for an unsupported HTTP method
  # @raise [Error] if the response is not a Net::HTTPSuccess
  def http_request(method, path, params: {}, payload: nil, base_url: nil, parse_json: true)
    base = base_url || @state.api_url
    uri = URI("#{base}#{path}")
    uri.query = URI.encode_www_form(params) unless params.empty?

    request = case method
    when :get
      Net::HTTP::Get.new(uri)
    when :post
      req = Net::HTTP::Post.new(uri)
      req["Content-Type"] = "application/json"
      req.body = JSON.dump(payload) if payload
      req
    else
      raise ArgumentError, "Unsupported HTTP method: #{method}"
    end

    request["Authorization"] = "Bearer #{@state.api_key}"

    # Monotonic clock: wall-clock time can jump and skew the measured duration.
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    Log.debug("[API] #{method.upcase} #{uri}")

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    response = http.request(request)

    duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) * 1000).round(2)
    # response.body is nil for body-less responses; to_s keeps logging from raising.
    body_bytes = response.body.to_s.bytesize
    Log.debug("[API] #{method.upcase} #{uri} -> #{response.code} (#{duration_ms}ms, #{body_bytes} bytes)")

    unless response.is_a?(Net::HTTPSuccess)
      Log.debug("[API] Error response body: #{response.body}")
      raise Error, "HTTP #{response.code} for #{method.upcase} #{uri}: #{response.body}"
    end

    parse_json ? JSON.parse(response.body) : response
  end

  # POST a JSON payload and return the raw, unparsed response.
  # @param path [String] Path appended to the base URL
  # @param payload [Object] JSON-serializable request body
  # @return [Net::HTTPResponse]
  def http_post_json_raw(path, payload)
    http_request(:post, path, payload: payload, parse_json: false)
  end
end
end
end
38 changes: 23 additions & 15 deletions lib/braintrust/eval.rb
Original file line number Diff line number Diff line change
Expand Up @@ -241,23 +241,31 @@ def run(project:, experiment:, task:, scorers:,
project_id = project_result["id"]
project_name = project_result["name"]

# Instantiate Runner and run evaluation
runner = Runner.new(
experiment_id: experiment_id,
experiment_name: experiment,
project_id: project_id,
project_name: project_name,
task: task,
scorers: scorers,
api: api,
tracer_provider: tracer_provider
)
result = runner.run(cases, parallelism: parallelism)
# Enable span cache for evaluation
api.state.span_cache.start

begin
# Instantiate Runner and run evaluation
runner = Runner.new(
experiment_id: experiment_id,
experiment_name: experiment,
project_id: project_id,
project_name: project_name,
task: task,
scorers: scorers,
api: api,
tracer_provider: tracer_provider
)
result = runner.run(cases, parallelism: parallelism)

# Print result summary unless quiet
print_result(result) unless quiet
# Print result summary unless quiet
print_result(result) unless quiet

result
result
ensure
# Disable and clear span cache after evaluation
api.state.span_cache.stop
end
end

private
Expand Down
29 changes: 25 additions & 4 deletions lib/braintrust/eval/runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
require_relative "result"
require_relative "summary"
require_relative "../internal/thread_pool"
require_relative "../trace_context"

require "opentelemetry/sdk"
require "json"
Expand Down Expand Up @@ -103,8 +104,11 @@ def run_case(test_case, errors)
end

# Run scorers
# Create TraceContext for scorers (if scorers exist)
trace = scorers.empty? ? nil : create_trace_context(eval_span)

begin
run_scorers(test_case, output)
run_scorers(test_case, output, trace)
rescue => e
# Error already recorded on score span, set eval span status
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
Expand Down Expand Up @@ -149,15 +153,16 @@ def run_task(test_case)
# Creates single score span for all scorers
# @param test_case [Case] The test case
# @param output [Object] Task output
def run_scorers(test_case, output)
# @param trace [TraceContext, nil] Optional trace context for scorers
def run_scorers(test_case, output, trace = nil)
tracer.in_span("score") do |score_span|
score_span.set_attribute("braintrust.parent", parent_attr)
set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
set_json_attr(score_span, "braintrust.span_attributes", {type: "score", purpose: "scorer"})

scores = {}
scorer_error = nil
scorers.each do |scorer|
score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {}, trace)
scores[scorer.name] = score_value

# Collect raw score for summary (thread-safe)
Expand Down Expand Up @@ -239,6 +244,22 @@ def collect_score(name, value)
(@scores[name] ||= []) << value
end
end

# Build the TraceContext handed to scorers so they can look up span data
# @param eval_span [OpenTelemetry::Trace::Span] The eval span
# @return [TraceContext]
def create_trace_context(eval_span)
  # The hex-encoded trace_id of the eval span is used as the root_span_id
  trace_id_hex = eval_span.context.trace_id.unpack1("H*")
  flush_spans = -> { @tracer_provider.force_flush }

  TraceContext.new(
    object_type: "experiment",
    object_id: experiment_id,
    root_span_id: trace_id_hex,
    state: @api.state,
    ensure_spans_flushed: flush_spans
  )
end
end
end
end
28 changes: 18 additions & 10 deletions lib/braintrust/eval/scorer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
module Braintrust
module Eval
# Scorer wraps a scoring function that evaluates task output against expected values
# Scorers can accept 3 params (input, expected, output) or 4 params (input, expected, output, metadata)
# Scorers can accept 3 params (input, expected, output), 4 params (input, expected, output, metadata),
# or 5 params (input, expected, output, metadata, trace)
# They can return a float, hash, or array of hashes
class Scorer
attr_reader :name
Expand Down Expand Up @@ -43,9 +44,10 @@ def initialize(name_or_callable = nil, callable = nil, &block)
# @param expected [Object] The expected output
# @param output [Object] The actual output from the task
# @param metadata [Hash] Optional metadata
# @param trace [TraceContext, nil] Optional trace context
# @return [Float, Hash, Array] Score value(s)
def call(input, expected, output, metadata = {})
@wrapped_callable.call(input, expected, output, metadata)
def call(input, expected, output, metadata = {}, trace = nil)
@wrapped_callable.call(input, expected, output, metadata, trace)
end

private
Expand All @@ -68,25 +70,31 @@ def detect_name(callable)
"scorer"
end

# Wrap the callable to always accept 4 parameters
# Wrap the callable to always accept 5 parameters
# @param callable [#call] The callable to wrap
# @return [Proc] Wrapped callable that accepts 4 params
# @return [Proc] Wrapped callable that accepts 5 params
def wrap_callable(callable)
arity = callable_arity(callable)

case arity
when 3
# Callable takes 3 params - wrap to ignore metadata
->(input, expected, output, metadata) {
# Callable takes 3 params - wrap to ignore metadata and trace
->(input, expected, output, metadata, trace) {
callable.call(input, expected, output)
}
when 4, -4, -1
# Callable takes 4 params (or variadic with 4+)
when 4, -4
# Callable takes 4 params - wrap to ignore trace
# -4 means optional 4th param
->(input, expected, output, metadata, trace) {
callable.call(input, expected, output, metadata)
}
when 5, -5, -1
# Callable takes 5 params (or variadic with 5+)
# -5 means optional 5th param
# -1 means variadic (*args)
callable
else
raise ArgumentError, "Scorer must accept 3 or 4 parameters (got arity #{arity})"
raise ArgumentError, "Scorer must accept 3, 4, or 5 parameters (got arity #{arity})"
end
end

Expand Down
Loading