clawd/scripts/transcribe-audio.sh

48 lines
1.1 KiB
Bash
Executable File

#!/bin/bash
# Transcribe an audio file with Groq's Whisper API and print the transcript.
#
# Usage: transcribe-audio.sh <audio_file>
#
# Environment:
#   GROQ_API_KEY  API key sent as the Bearer token (required).
#
# Requires: curl, jq; ffmpeg only when the input file name ends in .ogg.
# Output:   the "text" field of the JSON response on stdout (nothing if the
#           field is absent); all diagnostics go to stderr.
# Exit:     0 on success; non-zero on bad usage, failed conversion, transport
#           errors, or a non-2xx HTTP status from the API.
set -euo pipefail

readonly API_URL="https://api.groq.com/openai/v1/audio/transcriptions"
readonly MODEL="whisper-large-v3"

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

AUDIO_FILE="${1:-}"
GROQ_API_KEY="${GROQ_API_KEY:-}"

[[ -n "$AUDIO_FILE" ]] || die "Usage: $0 <audio_file>"
[[ -f "$AUDIO_FILE" ]] || die "Error: File not found: $AUDIO_FILE"
[[ -n "$GROQ_API_KEY" ]] || die "Error: GROQ_API_KEY not set"

# Remove the scratch directory on every exit path, including failures of
# ffmpeg or curl that abort the script early.
TEMP_DIR=""
cleanup() {
  if [[ -n "$TEMP_DIR" ]]; then
    rm -rf -- "$TEMP_DIR"
  fi
}
trap cleanup EXIT

# Convert ogg to mp3 if needed (Groq prefers standard formats).
if [[ "$AUDIO_FILE" == *.ogg ]]; then
  # A scratch directory plus a fixed file name avoids `mktemp --suffix`,
  # which is a GNU-only option.
  TEMP_DIR=$(mktemp -d) || die "Error: could not create temporary directory"
  converted_file="$TEMP_DIR/audio.mp3"
  # -nostdin keeps ffmpeg from consuming the caller's stdin; -loglevel error
  # silences the banner/progress output but still shows real failures.
  ffmpeg -nostdin -loglevel error -y -i "$AUDIO_FILE" \
    -acodec libmp3lame -q:a 2 "$converted_file" \
    || die "Error: ffmpeg failed to convert $AUDIO_FILE"
  AUDIO_FILE="$converted_file"
fi

# curl parses ',' and ';' in a -F file path as form syntax unless the path is
# double-quoted; inside the quotes, backslash and double-quote must be escaped.
form_path=${AUDIO_FILE//\\/\\\\}
form_path=${form_path//\"/\\\"}

# Call the Groq Whisper API. No explicit Content-Type header: -F makes curl
# emit multipart/form-data together with the boundary parameter. The HTTP
# status is appended on its own final line so it can be split off below.
RESPONSE=$(curl -sS -X POST "$API_URL" \
  -H "Authorization: Bearer $GROQ_API_KEY" \
  -F "file=@\"$form_path\"" \
  -F "model=$MODEL" \
  -F "response_format=json" \
  -w '\n%{http_code}') || die "Error: request to Groq API failed"

HTTP_CODE=${RESPONSE##*$'\n'}
BODY=${RESPONSE%$'\n'*}

# Surface API errors instead of printing nothing and exiting 0.
if [[ "$HTTP_CODE" != 2?? ]]; then
  die "Error: Groq API returned HTTP $HTTP_CODE: $BODY"
fi

# Extract text from response.
printf '%s\n' "$BODY" | jq -r '.text // empty'