Perfect Speech to Text in Emacs
This code requires:
sox - for audio recording
whisper.cpp - for speech-to-text transcription
The code expects:
The sox binary to be on the PATH seen by /bin/sh (the recording command is run through /bin/sh -c)
whisper.cpp compiled, with the whisper-cli binary at ~/whisper.cpp/build/bin/whisper-cli
Whisper model file at ~/whisper.cpp/models/ggml-base.en.bin
On Linux Mint, install sox with:
sudo apt install sox
For whisper.cpp, you'll need to clone, compile, and download a model from the whisper.cpp GitHub repository.
;; `string-trim' lives in subr-x on older Emacs releases; load it explicitly
;; rather than relying on some other package having pulled it in.
(require 'subr-x)

(defun run-whisper-stt--sentinel (proc event)
  "Insert the Whisper transcript produced by PROC and clean up.
PROC is the whisper-cli process started by `run-whisper-stt'; EVENT is
the status string Emacs passes to process sentinels.

The destination buffer and the insertion marker are read from the
process properties `run-whisper-stt-target' and `run-whisper-stt-marker',
so this function does not rely on closures and behaves the same with or
without lexical binding.

On success the trimmed transcript followed by a single space is inserted
at the marker, leaving point after the inserted text.  On failure a
message is shown instead.  In every case the process output buffer is
killed and the marker is released."
  ;; Act only once the process has really ended (normal exit or signal).
  (when (memq (process-status proc) '(exit signal))
    (let ((output-buffer (process-buffer proc))
          (target-buffer (process-get proc 'run-whisper-stt-target))
          (insert-marker (process-get proc 'run-whisper-stt-marker)))
      (unwind-protect
          (cond
           ;; Non-zero exit status or death by signal: report, insert nothing.
           ((not (and (eq (process-status proc) 'exit)
                      (zerop (process-exit-status proc))))
            (message "Whisper transcription failed: %s" (string-trim event)))
           ;; The user killed the buffer while Whisper was running.
           ((not (buffer-live-p target-buffer))
            (message "Whisper: target buffer was killed; transcript discarded"))
           (t
            (let ((text (if (buffer-live-p output-buffer)
                            (string-trim
                             (with-current-buffer output-buffer
                               (buffer-string)))
                          "")))
              (if (string= text "")
                  ;; Avoid inserting a lone space when nothing was recognized.
                  (message "Whisper: no speech recognized")
                (with-current-buffer target-buffer
                  (goto-char insert-marker)
                  ;; `insert' leaves point after the text, so no further
                  ;; cursor movement is needed.
                  (insert text " "))))))
        ;; Cleanup runs on success and on every failure path.
        (when (markerp insert-marker)
          (set-marker insert-marker nil))
        (when (buffer-live-p output-buffer)
          (kill-buffer output-buffer))))))

(defun run-whisper-stt ()
  "Record audio and transcribe it with Whisper, inserting text at point.
Recording is done by sox into /tmp/whisper-recording.wav and continues
until the user presses \\[keyboard-quit].  The recording is then handed
to whisper-cli asynchronously; when it finishes, the transcript is
inserted at the position point had when this command was invoked,
followed by a single space.

Signals a `user-error' if the recorder exits on its own or if no
recording file was produced."
  (interactive)
  (let* ((target-buffer (current-buffer))
         ;; A marker keeps tracking the position even if the buffer is
         ;; edited while recording or transcription is in progress.
         (insert-marker (point-marker))
         (wav-file "/tmp/whisper-recording.wav")
         (quoted-wav (shell-quote-argument wav-file))
         (stopped-by-user nil)
         (recorder nil))
    ;; Remove any recording left over from a previous run so that a failed
    ;; sox start can never cause stale audio to be transcribed.
    (when (file-exists-p wav-file)
      (delete-file wav-file))
    (setq recorder
          (start-process
           "record-audio" nil "/bin/sh" "-c"
           (format "sox -d -r 16000 -c 1 -b 16 %s --no-show-progress 2>/dev/null"
                   quoted-wav)))
    (message "Recording started. Press C-g to stop.")
    ;; Wait until the user presses C-g or the recorder dies by itself.
    ;; `sleep-for' is used instead of `sit-for' because `sit-for' returns
    ;; immediately while unread input is pending, which would turn this
    ;; loop into a busy loop as soon as any other key is typed.
    (condition-case nil
        (while (process-live-p recorder)
          (sleep-for 0.1))
      (quit (setq stopped-by-user t)))
    ;; Ask sox to stop, then give it time to exit so the WAV file is
    ;; complete before Whisper reads it.
    (when (process-live-p recorder)
      (interrupt-process recorder)
      (let ((deadline (+ (float-time) 5.0)))
        (while (and (process-live-p recorder)
                    (< (float-time) deadline))
          (accept-process-output recorder 0.1)))
      ;; Last resort if sox ignored the interrupt.
      (when (process-live-p recorder)
        (delete-process recorder)))
    (cond
     ((not stopped-by-user)
      (set-marker insert-marker nil)
      (user-error
       "Recording stopped unexpectedly; is sox installed and a microphone available?"))
     ((not (file-exists-p wav-file))
      (set-marker insert-marker nil)
      (user-error "No recording was written to %s" wav-file))
     (t
      (message "Transcribing...")
      ;; The output buffer is created only now so that the error paths
      ;; above cannot leak it.
      (let* ((output-buffer (generate-new-buffer " *Whisper Temp*"))
             (proc (start-process
                    "whisper-stt" output-buffer "/bin/sh" "-c"
                    (format "~/whisper.cpp/build/bin/whisper-cli -m ~/whisper.cpp/models/ggml-base.en.bin -f %s -nt -np 2>/dev/null"
                            quoted-wav))))
        ;; Hand the insertion target to the sentinel via process properties.
        (process-put proc 'run-whisper-stt-target target-buffer)
        (process-put proc 'run-whisper-stt-marker insert-marker)
        (set-process-sentinel proc #'run-whisper-stt--sentinel))))))

(global-set-key (kbd "C-c v") #'run-whisper-stt)