Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions app/components/home/contents/HomeContent.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,16 @@ export default function HomeContent() {
const getDisplayText = (interaction: Interaction) => {
// Check for errors first
if (interaction.asr_output?.error) {
// Prefer precise error code mapping when available
const code = interaction.asr_output?.errorCode
if (code === 'CLIENT_TRANSCRIPTION_QUALITY_ERROR') {
return {
text: 'Audio quality too low',
isError: true,
tooltip:
'Audio quality was too low to generate a reliable transcript',
}
}
if (
interaction.asr_output.error.includes('No speech detected in audio.') ||
interaction.asr_output.error.includes('Unable to transcribe audio.')
Expand Down
4 changes: 4 additions & 0 deletions lib/clients/grpcClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ class GrpcClient {
baseUrl: import.meta.env.VITE_GRPC_BASE_URL,
httpVersion: '1.1',
})
console.log(
'Creating gRPC client with base URL:',
import.meta.env.VITE_GRPC_BASE_URL,
)
this.client = createClient(ItoService, transport)
}

Expand Down
4 changes: 2 additions & 2 deletions lib/constants/generated-defaults.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ export const DEFAULT_ADVANCED_SETTINGS = {
- Resolve corrections smoothly: when the speaker self-corrects ("let's do next week... no, next month"), choose the final phrasing.
- Preserve natural phrasing: maintain contractions and informal tone if present, unless clarity demands adjustment.
- Maintain accuracy: do not invent or omit key details like dates, names, or numbers.
- Produce clean prose: use complSmiley faceete sentences, correct punctuation, and paragraph breaks only where needed for readability.
- Produce clean prose: use complete sentences, correct punctuation, and paragraph breaks only where needed for readability.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

didn't we fix this before?

- Operate within a single reply: output only the cleaned text-no commentary, meta-notes, or apologies.

Example
Expand All @@ -50,5 +50,5 @@ When you receive a transcript, immediately return the polished version following

// Audio quality thresholds
noSpeechThreshold: 0.6,
lowQualityThreshold: -0.55,
lowQualityThreshold: -0.75,
} as const
2 changes: 2 additions & 0 deletions lib/main/interactions/InteractionManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ export class InteractionManager {
audioBuffer: Buffer,
sampleRate: number,
errorMessage?: string,
errorCode?: string,
) {
if (!this.currentInteractionId) {
log.warn(
Expand Down Expand Up @@ -63,6 +64,7 @@ export class InteractionManager {
transcript,
totalAudioBytes: audioBuffer.length,
error: errorMessage || null,
errorCode: errorCode || null,
timestamp: new Date().toISOString(),
durationMs,
}
Expand Down
1 change: 1 addition & 0 deletions lib/main/transcriptionService.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ describe('TranscriptionService', () => {
Buffer.from('audio-data'),
16000,
undefined,
undefined,
)
expect(mockWindowMessenger.sendTranscriptionResult).toHaveBeenCalledWith({
transcript: mockTranscript,
Expand Down
3 changes: 3 additions & 0 deletions lib/main/transcriptionService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ export class TranscriptionService {
})

const errorMessage = response.error ? response.error.message : undefined
const errorCode = response.error ? response.error.code : undefined

// Handle any transcription error
if (response.error) {
Expand All @@ -105,6 +106,7 @@ export class TranscriptionService {
this.audioStreamManager.getInteractionAudioBuffer(),
this.audioStreamManager.getCurrentSampleRate(),
errorMessage,
errorCode,
)

this.audioStreamManager.clearInteractionAudio()
Expand Down Expand Up @@ -142,6 +144,7 @@ export class TranscriptionService {
this.audioStreamManager.getInteractionAudioBuffer(),
this.audioStreamManager.getCurrentSampleRate(),
errorMessage,
errorCode,
)
}

Expand Down
40 changes: 32 additions & 8 deletions server/src/clients/groqClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,23 @@ class GroqClient implements LlmProvider {
}
}

/**
 * Collapse the per-segment `avg_logprob` values into one outlier-resistant score.
 *
 * Despite the name, the value returned is the median (not the arithmetic mean)
 * of every finite numeric `avg_logprob` found on the segments; for an even
 * number of values it is the midpoint of the two central ones.
 *
 * @param segments Transcription segments; entries that are nullish or lack a
 * finite numeric `avg_logprob` are ignored.
 * @returns The median log probability, or `null` when `segments` is not a
 * non-empty array or no usable value is present.
 */
private calcAvgLogprob(segments: any[]): number | null {
  if (!Array.isArray(segments) || segments.length === 0) {
    return null
  }

  // Gather only the usable scores; anything non-numeric or non-finite is dropped.
  const logprobs: number[] = []
  for (const segment of segments) {
    const candidate = segment?.avg_logprob
    if (typeof candidate === 'number' && isFinite(candidate)) {
      logprobs.push(candidate)
    }
  }
  if (logprobs.length === 0) {
    return null
  }

  // Ascending numeric order so the central element(s) can be picked by index.
  logprobs.sort((lhs, rhs) => lhs - rhs)
  const upper = Math.floor(logprobs.length / 2)
  if (logprobs.length % 2 === 1) {
    return logprobs[upper]
  }
  return (logprobs[upper - 1] + logprobs[upper]) / 2
}

/**
* Transcribes an audio buffer using the Groq API.
* @param audioBuffer The audio data as a Node.js Buffer.
Expand All @@ -99,6 +116,7 @@ class GroqClient implements LlmProvider {
audioBuffer: Buffer,
options?: TranscriptionOptions,
): Promise<string> {
console.log('Transcribing audio with options:', options)
const fileType = options?.fileType || 'webm'
const asrModel = options?.asrModel
const vocabulary = options?.vocabulary
Expand Down Expand Up @@ -137,18 +155,24 @@ class GroqClient implements LlmProvider {

const segments = (transcription as any).segments
if (segments && segments.length > 0) {
const segment = segments[0]
if (segment.no_speech_prob > noSpeechThreshold) {
const first = segments[0]
if (first?.no_speech_prob > noSpeechThreshold) {
console.log('No speech probability:', first.no_speech_prob)
throw new ClientNoSpeechError(
ClientProvider.GROQ,
segment.no_speech_prob,
)
} else if (segment.avg_logprob < lowQualityThreshold) {
throw new ClientTranscriptionQualityError(
ClientProvider.GROQ,
segment.avg_logprob,
first.no_speech_prob,
)
}
const robustAvg = this.calcAvgLogprob(segments)
if (typeof robustAvg === 'number') {
if (robustAvg < lowQualityThreshold) {
console.log('Low quality probability (robust avg):', robustAvg)
throw new ClientTranscriptionQualityError(
ClientProvider.GROQ,
robustAvg,
)
}
}
}

// The Node SDK returns the full object, the text is in the `text` property.
Expand Down
33 changes: 12 additions & 21 deletions server/src/constants/generated-defaults.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
* Run 'bun generate:constants' to regenerate
*/

import { START_CONTEXT_MARKER, END_CONTEXT_MARKER } from './markers.js'

export const DEFAULT_ADVANCED_SETTINGS = {
// ASR (Automatic Speech Recognition) settings
asrProvider: 'groq',
Expand Down Expand Up @@ -37,27 +35,20 @@ Cleaned output:

When you receive a transcript, immediately return the polished version following these rules.
`,
editingPrompt: `
You are an AI assistant helping to edit documents based on user commands. These documents may be emails, notes, or any other text-based content in any application. You will be given the current document content (marked by {START_CONTEXT_MARKER} and {END_CONTEXT_MARKER}) and a user command (marked by {USER_COMMAND_MARKER}).
The document may be empty.

IMPORTANT: Your response MUST contain ONLY the modified document text that should replace the original content. DO NOT include:
- Any markers like ${START_CONTEXT_MARKER} or ${END_CONTEXT_MARKER}
- Any explanations, apologies, or additional text
- Any formatting markers like ---

FORMATTING RULES:
1. Use proper formatting:
- Use actual line breaks, not spaces
- For bullet points, use "- " at the start of lines
- Maintain consistent indentation

For example, if you're editing an email, only return the email text itself, with all formatting preserved. If you're editing a document, only return the document content with exact formatting. The application will handle the context.

Your response should start with the very first character of the modified content and end with the very last character.
editingPrompt: ` You are a Command-Interpreter assistant. Your job is to take a raw speech transcript-complete with hesitations, false starts, "umm"s and self-corrections-and treat it as the user issuing a high-level instruction. Instead of merely polishing their words, you must:
1. Extract the intent: identify the action the user is asking for (e.g. "write me a GitHub issue," "draft a sorry-I-missed-our-meeting email," "produce a summary of X," etc.).
2. Ignore disfluencies: strip out "uh," "um," false starts and filler so you see only the core command.
3. Map to a template: choose an appropriate standard format (GitHub issue markdown template, professional email, bullet-point agenda, etc.) that matches the intent.
4. Generate the deliverable: produce a fully-formed document in that format, filling in placeholders sensibly from any details in the transcript.
5. Do not add new intent: if the transcript doesn't specify something (e.g. title, recipients, date), use reasonable defaults (e.g. "Untitled Issue," "To: [Recipient]") or prompt the user for the missing piece.
6. Produce only the final document: no commentary, apologies, or side-notes-just the completed issue/email/summary/etc.
7. Your response MUST contain ONLY the resultant text. DO NOT include:
- Any markers like [START/END CURRENT NOTES CONTENT]
- Any explanations, apologies, or additional text
- Any formatting markers like --- or \`\`\`
Comment on lines +38 to +48
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this to try to keep the prompt from leaking?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was just the result of running bun generate:constants

`,

// Audio quality thresholds
noSpeechThreshold: 0.6,
lowQualityThreshold: -0.55,
lowQualityThreshold: -0.75,
} as const
8 changes: 5 additions & 3 deletions server/src/services/ito/itoService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import {
createUserPromptWithContext,
} from './helpers.js'
import { ITO_MODE_SYSTEM_PROMPT } from './constants.js'
import { enhancePcm16 } from '../../utils/audio.js'

/**
* --- NEW: WAV Header Generation Function ---
Expand Down Expand Up @@ -220,14 +221,15 @@ export default (router: ConnectRouter) => {
const bitDepth = 16
const channels = 1 // Mono

// 2. Create the header with the correct properties.
// 2. Enhance the PCM and create the header with the correct properties.
const enhancedPcm = enhancePcm16(Buffer.from(fullAudio), sampleRate)
const wavHeader = createWavHeader(
fullAudio.length,
enhancedPcm.length,
sampleRate,
channels,
bitDepth,
)
const fullAudioWAV = Buffer.concat([wavHeader, fullAudio])
const fullAudioWAV = Buffer.concat([wavHeader, enhancedPcm])

// 3. Extract and validate vocabulary from gRPC metadata
const vocabularyHeader = context.requestHeader.get('vocabulary')
Expand Down
69 changes: 69 additions & 0 deletions server/src/utils/audio.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
 * Light audio enhancement for 16-bit signed little-endian PCM mono audio.
 *
 * Processing steps, in order:
 * 1. Remove the DC offset by subtracting the mean sample value.
 * 2. Apply a gentle one-pole high-pass filter (~80 Hz cutoff).
 * 3. Peak-normalize toward ~-3 dBFS. The gain is capped at 4x (~+12 dB) and is
 *    only applied when it exceeds 1.05; the signal is never attenuated.
 *
 * All intermediate maths is done in floating point and samples are clamped to
 * the int16 range only when written out. Storing the DC-corrected samples back
 * into an Int16Array would wrap modulo 2^16, flipping the sign of loud samples
 * whenever the offset pushes them past full scale.
 *
 * @param pcm - Raw 16-bit signed little-endian samples. A trailing odd byte is ignored.
 * @param sampleRate - Sample rate of `pcm` in Hz.
 * @returns A new buffer holding the enhanced samples (two bytes per whole input
 * sample), or `pcm` itself, untouched, when it contains less than one sample or
 * `sampleRate` is not a positive finite number.
 */
export function enhancePcm16(pcm: Buffer, sampleRate: number): Buffer {
  if (!pcm || pcm.length < 2) return pcm
  // With a zero, negative or non-finite rate the filter coefficient degenerates
  // to 0 or NaN and would wipe out the audio; pass the input through instead.
  if (!Number.isFinite(sampleRate) || sampleRate <= 0) return pcm

  const sampleCount = Math.floor(pcm.length / 2)

  // Decode int16 samples into a float working buffer and accumulate their sum.
  const work = new Float64Array(sampleCount)
  let sum = 0
  for (let i = 0; i < sampleCount; i++) {
    const sample = pcm.readInt16LE(i * 2)
    work[i] = sample
    sum += sample
  }

  // DC offset removal. The result may exceed the int16 range, which is fine
  // here because the working buffer is floating point.
  const mean = sum / sampleCount
  if (mean !== 0) {
    for (let i = 0; i < sampleCount; i++) work[i] -= mean
  }

  // Gentle high-pass filter (~80 Hz), applied in place while tracking the peak.
  const fc = 80
  const a = Math.exp((-2 * Math.PI * fc) / sampleRate)
  let prevX = 0
  let prevY = 0
  let peak = 1 // floor of 1 keeps the gain computation away from division by zero
  for (let i = 0; i < sampleCount; i++) {
    const x = work[i]
    const y = a * (prevY + x - prevX)
    work[i] = y
    prevX = x
    prevY = y
    const magnitude = Math.abs(y)
    if (magnitude > peak) peak = magnitude
  }

  // Peak normalize to ~-3 dBFS, cap max gain to ~+12 dB, and skip negligible
  // boosts so already-healthy audio is left at its filtered level.
  const target = 0.707 * 32767 // ≈ -3 dBFS
  const cappedGain = Math.min(target / peak, 4.0)
  const appliedGain = cappedGain > 1.05 ? cappedGain : 1

  const out = Buffer.alloc(sampleCount * 2)
  for (let i = 0; i < sampleCount; i++) {
    const scaled = Math.round(work[i] * appliedGain)
    // Saturate rather than wrap when a sample exceeds the int16 range.
    out.writeInt16LE(Math.max(-32768, Math.min(32767, scaled)), i * 2)
  }

  return out
}
2 changes: 1 addition & 1 deletion shared-constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ When you receive a transcript, immediately return the polished version following

// Audio quality thresholds
noSpeechThreshold: 0.6,
lowQualityThreshold: -0.55,
lowQualityThreshold: -0.75,
}

module.exports = { DEFAULT_ADVANCED_SETTINGS }