@@ -69,15 +69,17 @@ impl ResultStreamExt<anyhow::Error> for crate::BoxStream<ChatCompletionMessage,
6969 anyhow:: Ok ( message?) . with_context ( || "Failed to process message stream" ) ?;
7070 // Process usage information
7171 // - For Anthropic-style streaming: input tokens in MessageStart, output tokens
72- // in MessageDelta
72+ // in MessageDelta (values are CUMULATIVE, not incremental)
73+ // ref: https://platform.claude.com/docs/en/build-with-claude/streaming#event-types
7374 // - For OpenAI-style streaming: all tokens in the final chunk
7475 // - For GLM-style: may send complete usage in every chunk (need to replace, not
7576 // accumulate)
77+ // - For Google-style: cumulative usage in every chunk
7678 // - Cost-only events: have 0 tokens but a cost value
7779 if let Some ( current_usage) = message. usage . as_ref ( ) {
7880 // If current usage has both prompt and completion tokens, it's a "complete"
79- // usage In this case, replace instead of accumulate (handles
80- // GLM-style streaming)
81+ // usage. In this case, replace instead of merge (handles GLM-style streaming
82+ // where every chunk has full usage).
8183 let is_complete_usage =
8284 * current_usage. prompt_tokens > 0 && * current_usage. completion_tokens > 0 ;
8385
@@ -95,10 +97,19 @@ impl ResultStreamExt<anyhow::Error> for crate::BoxStream<ChatCompletionMessage,
9597 }
9698 } else if is_cost_only {
9799 // Add only the cost to the existing usage; token counts are left unchanged
98- usage. cost = current_usage. cost ;
100+ usage. cost = match ( usage. cost , current_usage. cost ) {
101+ ( Some ( a) , Some ( b) ) => Some ( a + b) ,
102+ ( Some ( a) , None ) => Some ( a) ,
103+ ( None , Some ( b) ) => Some ( b) ,
104+ ( None , None ) => None ,
105+ } ;
99106 } else {
100- // Accumulate partial usage (for Anthropic-style streaming)
101- usage = usage. accumulate ( current_usage) ;
107+ // Merge partial usage using "max" strategy. This correctly handles
108+ // providers like Anthropic where usage values are CUMULATIVE across
109+ // events (message_start has input tokens, message_delta has the
110+ // total output tokens). Using max instead of sum prevents
111+ // double-counting when message_start includes output_tokens=1.
112+ usage = usage. merge ( current_usage) ;
102113 }
103114 }
104115
@@ -485,8 +496,73 @@ mod tests {
485496 }
486497
487498 #[ tokio:: test]
488- async fn test_into_full_anthropic_streaming_usage_accumulation ( ) {
499+ async fn test_into_full_anthropic_streaming_usage_merge ( ) {
500+ // Fixture: Simulate an Anthropic streaming pattern where message_start already
501+ // reports output_tokens=1 (the common case) and message_delta reports the cumulative total.
502+ // Verifies that usage is combined with merge (max) rather than accumulate (sum), so the
503+ // single output token from message_start is not double-counted.
504+ let messages = vec ! [
505+ // MessageStart: input token usage AND output_tokens=1 (prompt and completion both > 0)
506+ Ok ( ChatCompletionMessage :: default ( ) . usage( Usage {
507+ prompt_tokens: TokenCount :: Actual ( 1000 ) ,
508+ completion_tokens: TokenCount :: Actual ( 1 ) ,
509+ total_tokens: TokenCount :: Actual ( 1001 ) ,
510+ cached_tokens: TokenCount :: Actual ( 300 ) ,
511+ cost: None ,
512+ } ) ) ,
513+ // Content deltas (carry no usage)
514+ Ok ( ChatCompletionMessage :: default ( ) . content( Content :: part( "Hello " ) ) ) ,
515+ Ok ( ChatCompletionMessage :: default ( ) . content( Content :: part( "world!" ) ) ) ,
516+ // MessageDelta: cumulative output token usage only (prompt/cached are 0) plus finish reason
517+ Ok ( ChatCompletionMessage :: default ( )
518+ . usage( Usage {
519+ prompt_tokens: TokenCount :: Actual ( 0 ) ,
520+ completion_tokens: TokenCount :: Actual ( 50 ) ,
521+ total_tokens: TokenCount :: Actual ( 50 ) ,
522+ cached_tokens: TokenCount :: Actual ( 0 ) ,
523+ cost: None ,
524+ } )
525+ . finish_reason( FinishReason :: Stop ) ) ,
526+ ] ;
527+
528+ let result_stream: BoxStream < ChatCompletionMessage , anyhow:: Error > =
529+ Box :: pin ( tokio_stream:: iter ( messages) ) ;
530+
531+ // Actual: Fold the whole stream into a single full message
532+ let actual = result_stream. into_full ( false ) . await . unwrap ( ) ;
533+
534+ // Expected: Usage should use max (merge) not sum (accumulate).
535+ // message_start has completion_tokens=1 and prompt_tokens=1000, so
536+ // is_complete_usage=true -> replace: usage = {1000, 1, 1001, 300}
537+ // message_delta has prompt=0, completion=50 -> is_complete_usage=false ->
538+ // merge: prompt = max(1000, 0) = 1000
539+ // completion = max(1, 50) = 50 (NOT 1+50=51)
540+ // total = max(1001, 50) = 1001 (NOTE(review): != prompt + completion = 1050; confirm intended)
541+ // cached = max(300, 0) = 300
542+ let expected = ChatCompletionMessageFull {
543+ content : "Hello world!" . to_string ( ) ,
544+ tool_calls : vec ! [ ] ,
545+ thought_signature : None ,
546+ usage : Usage {
547+ prompt_tokens : TokenCount :: Actual ( 1000 ) ,
548+ completion_tokens : TokenCount :: Actual ( 50 ) , // max(1, 50) = 50, NOT 1+50=51
549+ total_tokens : TokenCount :: Actual ( 1001 ) , // max(1001, 50) = 1001, i.e. message_start's total
550+ cached_tokens : TokenCount :: Actual ( 300 ) ,
551+ cost : None ,
552+ } ,
553+ reasoning : None ,
554+ reasoning_details : None ,
555+ finish_reason : Some ( FinishReason :: Stop ) ,
556+ phase : None ,
557+ } ;
558+
559+ assert_eq ! ( actual, expected) ;
560+ }
561+
562+ #[ tokio:: test]
563+ async fn test_into_full_anthropic_streaming_usage_merge_zero_output ( ) {
489564 // Fixture: Simulate Anthropic/Vertex AI Anthropic streaming pattern
565+ // where message_start has output_tokens=0 (Vertex AI pattern).
490566 // MessageStart event has input tokens, MessageDelta has output tokens
491567 let messages = vec ! [
492568 // MessageStart with input token usage
@@ -518,15 +594,15 @@ mod tests {
518594 // Actual: Convert stream to full message
519595 let actual = result_stream. into_full ( false ) . await . unwrap ( ) ;
520596
521- // Expected: Usage should be accumulated from both MessageStart and MessageDelta
597+ // Expected: Usage should be merged from both MessageStart and MessageDelta
522598 let expected = ChatCompletionMessageFull {
523599 content : "Hello world!" . to_string ( ) ,
524600 tool_calls : vec ! [ ] ,
525601 thought_signature : None ,
526602 usage : Usage {
527603 prompt_tokens : TokenCount :: Actual ( 1000 ) , // From MessageStart
528604 completion_tokens : TokenCount :: Actual ( 50 ) , // From MessageDelta
529- total_tokens : TokenCount :: Actual ( 1050 ) , // Sum of both
605+ total_tokens : TokenCount :: Actual ( 1000 ) , // max(1000, 50) = 1000
530606 cached_tokens : TokenCount :: Actual ( 300 ) , // From MessageStart
531607 cost : None ,
532608 } ,
0 commit comments