heyito · JohnDonavon · Oct 15, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 15, 2025
diff --git a/app/components/home/contents/HomeContent.tsx b/app/components/home/contents/HomeContent.tsx
@@ -296,6 +296,16 @@ export default function HomeContent() {
   const getDisplayText = (interaction: Interaction) => {
     // Check for errors first
     if (interaction.asr_output?.error) {
+      // Prefer precise error code mapping when available
+      const code = interaction.asr_output?.errorCode
+      if (code === 'CLIENT_TRANSCRIPTION_QUALITY_ERROR') {
+        return {
+          text: 'Audio quality too low',
+          isError: true,
+          tooltip:
+            'Audio quality was too low to generate a reliable transcript',
+        }
+      }
       if (
         interaction.asr_output.error.includes('No speech detected in audio.') ||
         interaction.asr_output.error.includes('Unable to transcribe audio.')

diff --git a/lib/clients/grpcClient.ts b/lib/clients/grpcClient.ts
@@ -50,6 +50,10 @@ class GrpcClient {
       baseUrl: import.meta.env.VITE_GRPC_BASE_URL,
       httpVersion: '1.1',
     })
+    console.log(
+      'Creating gRPC client with base URL:',
+      import.meta.env.VITE_GRPC_BASE_URL,
+    )
     this.client = createClient(ItoService, transport)
   }
 

diff --git a/lib/constants/generated-defaults.ts b/lib/constants/generated-defaults.ts
@@ -23,7 +23,7 @@ export const DEFAULT_ADVANCED_SETTINGS = {
 - Resolve corrections smoothly: when the speaker self-corrects ("let's do next week... no, next month"), choose the final phrasing.
 - Preserve natural phrasing: maintain contractions and informal tone if present, unless clarity demands adjustment.
 - Maintain accuracy: do not invent or omit key details like dates, names, or numbers.
-- Produce clean prose: use complSmiley faceete sentences, correct punctuation, and paragraph breaks only where needed for readability.
+- Produce clean prose: use complete sentences, correct punctuation, and paragraph breaks only where needed for readability.
 - Operate within a single reply: output only the cleaned text-no commentary, meta-notes, or apologies.
 
 Example
@@ -50,5 +50,5 @@ When you receive a transcript, immediately return the polished version following
 
   // Audio quality thresholds
   noSpeechThreshold: 0.6,
-  lowQualityThreshold: -0.55,
+  lowQualityThreshold: -0.75,
 } as const
diff --git a/lib/main/interactions/InteractionManager.ts b/lib/main/interactions/InteractionManager.ts
@@ -33,6 +33,7 @@ export class InteractionManager {
     audioBuffer: Buffer,
     sampleRate: number,
     errorMessage?: string,
+    errorCode?: string,
   ) {
     if (!this.currentInteractionId) {
       log.warn(
@@ -63,6 +64,7 @@ export class InteractionManager {
         transcript,
         totalAudioBytes: audioBuffer.length,
         error: errorMessage || null,
+        errorCode: errorCode || null,
         timestamp: new Date().toISOString(),
         durationMs,
       }

diff --git a/lib/main/transcriptionService.test.ts b/lib/main/transcriptionService.test.ts
@@ -203,6 +203,7 @@ describe('TranscriptionService', () => {
       Buffer.from('audio-data'),
       16000,
       undefined,
+      undefined,
     )
     expect(mockWindowMessenger.sendTranscriptionResult).toHaveBeenCalledWith({
       transcript: mockTranscript,

diff --git a/lib/main/transcriptionService.ts b/lib/main/transcriptionService.ts
@@ -96,6 +96,7 @@ export class TranscriptionService {
     })
 
     const errorMessage = response.error ? response.error.message : undefined
+    const errorCode = response.error ? response.error.code : undefined
 
     // Handle any transcription error
     if (response.error) {
@@ -105,6 +106,7 @@ export class TranscriptionService {
         this.audioStreamManager.getInteractionAudioBuffer(),
         this.audioStreamManager.getCurrentSampleRate(),
         errorMessage,
+        errorCode,
       )
 
       this.audioStreamManager.clearInteractionAudio()
@@ -142,6 +144,7 @@ export class TranscriptionService {
           this.audioStreamManager.getInteractionAudioBuffer(),
           this.audioStreamManager.getCurrentSampleRate(),
           errorMessage,
+          errorCode,
         )
       }
 

diff --git a/server/src/clients/groqClient.ts b/server/src/clients/groqClient.ts
@@ -89,6 +89,23 @@ class GroqClient implements LlmProvider {
     }
   }
 
+  /**
+   * Median of the finite `avg_logprob` values in `segments` (a median, despite
+   * the name, so outlier segments weigh less); null when none are usable.
+   */
+  private calcAvgLogprob(segments: any[]): number | null {
+    if (!Array.isArray(segments)) return null
+    const logprobs: number[] = []
+    for (const v of segments.map(segment => segment?.avg_logprob)) {
+      if (typeof v === 'number' && isFinite(v)) logprobs.push(v)
+    }
+    if (logprobs.length === 0) return null
+    logprobs.sort((lo, hi) => lo - hi) // ascending numeric order
+    const upper = logprobs.length >> 1 // index of the upper-middle element
+    if (logprobs.length % 2 === 1) return logprobs[upper]
+    return (logprobs[upper - 1] + logprobs[upper]) / 2
+  }
+
   /**
    * Transcribes an audio buffer using the Groq API.
    * @param audioBuffer The audio data as a Node.js Buffer.
@@ -99,6 +116,7 @@ class GroqClient implements LlmProvider {
     audioBuffer: Buffer,
     options?: TranscriptionOptions,
   ): Promise<string> {
+    console.log('Transcribing audio with options:', options)
     const fileType = options?.fileType || 'webm'
     const asrModel = options?.asrModel
     const vocabulary = options?.vocabulary
@@ -137,18 +155,24 @@ class GroqClient implements LlmProvider {
 
       const segments = (transcription as any).segments
       if (segments && segments.length > 0) {
-        const segment = segments[0]
-        if (segment.no_speech_prob > noSpeechThreshold) {
+        const first = segments[0]
+        if (first?.no_speech_prob > noSpeechThreshold) {
+          console.log('No speech probability:', first.no_speech_prob)
           throw new ClientNoSpeechError(
             ClientProvider.GROQ,
-            segment.no_speech_prob,
-          )
-        } else if (segment.avg_logprob < lowQualityThreshold) {
-          throw new ClientTranscriptionQualityError(
-            ClientProvider.GROQ,
-            segment.avg_logprob,
+            first.no_speech_prob,
           )
         }
+        const robustAvg = this.calcAvgLogprob(segments)
+        if (typeof robustAvg === 'number') {
+          if (robustAvg < lowQualityThreshold) {
+            console.log('Low quality probability (robust avg):', robustAvg)
+            throw new ClientTranscriptionQualityError(
+              ClientProvider.GROQ,
+              robustAvg,
+            )
+          }
+        }
       }
 
       // The Node SDK returns the full object, the text is in the `text` property.

diff --git a/server/src/constants/generated-defaults.ts b/server/src/constants/generated-defaults.ts
@@ -4,8 +4,6 @@
  * Run 'bun generate:constants' to regenerate
  */
 
-import { START_CONTEXT_MARKER, END_CONTEXT_MARKER } from './markers.js'
-
 export const DEFAULT_ADVANCED_SETTINGS = {
   // ASR (Automatic Speech Recognition) settings
   asrProvider: 'groq',
@@ -37,27 +35,20 @@ Cleaned output:
 
 When you receive a transcript, immediately return the polished version following these rules.
 `,
-  editingPrompt: `
-You are an AI assistant helping to edit documents based on user commands. These documents may be emails, notes, or any other text-based content in any application. You will be given the current document content (marked by {START_CONTEXT_MARKER} and {END_CONTEXT_MARKER}) and a user command (marked by {USER_COMMAND_MARKER}). 
-The document may be empty.
-
-IMPORTANT: Your response MUST contain ONLY the modified document text that should replace the original content. DO NOT include:
-- Any markers like ${START_CONTEXT_MARKER} or ${END_CONTEXT_MARKER}
-- Any explanations, apologies, or additional text
-- Any formatting markers like ---
-
-FORMATTING RULES:
-1. Use proper formatting:
-  - Use actual line breaks, not spaces
-  - For bullet points, use "- " at the start of lines
-  - Maintain consistent indentation
-
-For example, if you're editing an email, only return the email text itself, with all formatting preserved. If you're editing a document, only return the document content with exact formatting. The application will handle the context.
-
-Your response should start with the very first character of the modified content and end with the very last character.
+  editingPrompt: ` You are a Command-Interpreter assistant. Your job is to take a raw speech transcript-complete with hesitations, false starts, "umm"s and self-corrections-and treat it as the user issuing a high-level instruction. Instead of merely polishing their words, you must:
+    1.	Extract the intent: identify the action the user is asking for (e.g. "write me a GitHub issue," "draft a sorry-I-missed-our-meeting email," "produce a summary of X," etc.).
+    2.	Ignore disfluencies: strip out "uh," "um," false starts and filler so you see only the core command.
+    3.	Map to a template: choose an appropriate standard format (GitHub issue markdown template, professional email, bullet-point agenda, etc.) that matches the intent.
+    4.	Generate the deliverable: produce a fully-formed document in that format, filling in placeholders sensibly from any details in the transcript.
+    5.	Do not add new intent: if the transcript doesn't specify something (e.g. title, recipients, date), use reasonable defaults (e.g. "Untitled Issue," "To: [Recipient]") or prompt the user for the missing piece.
+    6.	Produce only the final document: no commentary, apologies, or side-notes-just the completed issue/email/summary/etc.
+    7. Your response MUST contain ONLY the resultant text. DO NOT include:
+      - Any markers like [START/END CURRENT NOTES CONTENT]
+      - Any explanations, apologies, or additional text
+      - Any formatting markers like --- or \`\`\`
   `,
 
   // Audio quality thresholds
   noSpeechThreshold: 0.6,
-  lowQualityThreshold: -0.55,
+  lowQualityThreshold: -0.75,
 } as const
diff --git a/server/src/services/ito/itoService.ts b/server/src/services/ito/itoService.ts
@@ -45,6 +45,7 @@ import {
   createUserPromptWithContext,
 } from './helpers.js'
 import { ITO_MODE_SYSTEM_PROMPT } from './constants.js'
+import { enhancePcm16 } from '../../utils/audio.js'
 
 /**
  * --- NEW: WAV Header Generation Function ---
@@ -220,14 +221,15 @@ export default (router: ConnectRouter) => {
         const bitDepth = 16
         const channels = 1 // Mono
 
-        // 2. Create the header with the correct properties.
+        // 2. Enhance the PCM and create the header with the correct properties.
+        const enhancedPcm = enhancePcm16(Buffer.from(fullAudio), sampleRate)
         const wavHeader = createWavHeader(
-          fullAudio.length,
+          enhancedPcm.length,
           sampleRate,
           channels,
           bitDepth,
         )
-        const fullAudioWAV = Buffer.concat([wavHeader, fullAudio])
+        const fullAudioWAV = Buffer.concat([wavHeader, enhancedPcm])
 
         // 3. Extract and validate vocabulary from gRPC metadata
         const vocabularyHeader = context.requestHeader.get('vocabulary')

diff --git a/server/src/utils/audio.ts b/server/src/utils/audio.ts
@@ -0,0 +1,69 @@
+/**
+ * Light audio enhancement for 16-bit little-endian PCM mono audio.
+ * - Removes DC offset (subtracts the truncated mean sample value)
+ * - Applies a gentle high-pass filter (~80 Hz)
+ * - Peak normalizes to ~-3 dBFS with gain capped at 4x (~+12 dB); gains of
+ *   1.05x or less are skipped, so the signal is never scaled down
+ *
+ * @param pcm Raw samples; a trailing odd byte is ignored.
+ * @param sampleRate Sample rate in Hz, used for the filter coefficient.
+ * @returns A new buffer of processed samples, or `pcm` itself when it holds no
+ *   complete sample or `sampleRate` is not a positive finite number.
+ */
+export function enhancePcm16(pcm: Buffer, sampleRate: number): Buffer {
+  if (!pcm || pcm.length < 2) return pcm
+  // A non-positive or non-finite rate gives a degenerate or unstable filter
+  // coefficient; pass the audio through untouched rather than damage it.
+  if (!Number.isFinite(sampleRate) || sampleRate <= 0) return pcm
+
+  const sampleCount = Math.floor(pcm.length / 2)
+
+  // Read int16 samples and accumulate their sum for the DC estimate
+  const samples = new Int16Array(sampleCount)
+  let sum = 0
+  for (let i = 0; i < sampleCount; i++) {
+    samples[i] = pcm.readInt16LE(i * 2)
+    sum += samples[i]
+  }
+
+  // DC offset. It is subtracted in floating point inside the filter loop:
+  // storing `sample - mean` back into an Int16Array wraps modulo 2^16 when the
+  // difference leaves the int16 range, flipping the sign of loud peaks.
+  const mean = Math.trunc(sum / sampleCount)
+
+  // Gentle high-pass filter (~80 Hz): y[n] = a * (y[n-1] + x[n] - x[n-1])
+  const fc = 80
+  const a = Math.exp((-2 * Math.PI * fc) / sampleRate)
+  let prevX = 0
+  let prevY = 0
+  const filtered = new Float32Array(sampleCount)
+  for (let i = 0; i < sampleCount; i++) {
+    const x = samples[i] - mean
+    const y = a * (prevY + x - prevX)
+    filtered[i] = y
+    prevX = x
+    prevY = y
+  }
+
+  // Peak normalize to ~-3 dBFS, cap max gain to ~+12 dB
+  let peak = 1 // floor of 1 keeps the division below finite on silence
+  for (let i = 0; i < sampleCount; i++) {
+    const v = Math.abs(filtered[i])
+    if (v > peak) peak = v
+  }
+  const target = 0.707 * 32767 // ≈ -3 dBFS
+  const rawGain = target / peak
+  const cappedGain = Math.min(rawGain, 4.0)
+  // Only boost when it is worthwhile; otherwise write the filtered signal as-is.
+  const gain = cappedGain > 1.05 ? cappedGain : 1
+
+  const out = Buffer.alloc(sampleCount * 2)
+  for (let i = 0; i < sampleCount; i++) {
+    const v = Math.round(filtered[i] * gain)
+    // Clamp to the int16 range before writing.
+    const clamped = Math.max(-32768, Math.min(32767, v))
+    out.writeInt16LE(clamped, i * 2)
+  }
+
+  return out
+}
diff --git a/shared-constants.js b/shared-constants.js
@@ -49,7 +49,7 @@ When you receive a transcript, immediately return the polished version following
 
   // Audio quality thresholds
   noSpeechThreshold: 0.6,
-  lowQualityThreshold: -0.55,
+  lowQualityThreshold: -0.75,
 }
 
 module.exports = { DEFAULT_ADVANCED_SETTINGS }