// This code is based on the streaming example provided with whisper.cpp:
// https://github.com/ggerganov/whisper.cpp/blob/ca21f7ab16694384fb74b1ba4f68b39f16540d23/examples/stream/stream.cpp

#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "stream.h"

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <memory>
#include <thread>
#include <vector>

// RAII wrapper that releases the whisper context with whisper_free
struct whisper_context_deleter {
    void operator()(whisper_context *ctx) const { whisper_free(ctx); }
};

using unique_whisper = std::unique_ptr<whisper_context, whisper_context_deleter>;

struct stream_context {
    stream_params params;

    std::unique_ptr<audio_async> audio;
    unique_whisper whisper;

    std::vector<float> pcmf32;
    std::vector<float> pcmf32_old;
    std::vector<float> pcmf32_new;

    std::vector<whisper_token> prompt_tokens;

    std::chrono::time_point<std::chrono::high_resolution_clock> t_last;
    std::chrono::time_point<std::chrono::high_resolution_clock> t_start;

    int n_samples_step;
    int n_samples_len;
    int n_samples_keep;

    bool use_vad;
    int n_new_line;
    int n_iter = 0;
};

struct stream_params stream_default_params() {
    return stream_params {
        /* .n_threads     =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
        /* .step_ms       =*/ 3000,
        /* .length_ms     =*/ 10000,
        /* .keep_ms       =*/ 200,
        /* .capture_id    =*/ -1,
        /* .max_tokens    =*/ 32,
        /* .audio_ctx     =*/ 0,
        /* .vad_thold     =*/ 0.6f,
        /* .freq_thold    =*/ 100.0f,
        /* .speed_up      =*/ false,
        /* .translate     =*/ false,
        /* .print_special =*/ false,
        /* .no_context    =*/ true,
        /* .no_timestamps =*/ false,
        /* .language      =*/ "en",
        /* .model         =*/ "models/ggml-base.en.bin"
    };
}

stream_context *stream_init(stream_params params) {
    auto ctx = std::make_unique<stream_context>();

    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
    params.length_ms = std::max(params.length_ms, params.step_ms);

    ctx->n_samples_step = (1e-3 * params.step_ms)   * WHISPER_SAMPLE_RATE;
    ctx->n_samples_len  = (1e-3 * params.length_ms) * WHISPER_SAMPLE_RATE;
    ctx->n_samples_keep = (1e-3 * params.keep_ms)   * WHISPER_SAMPLE_RATE;

    const int n_samples_30s = (1e-3 * 30000.0) * WHISPER_SAMPLE_RATE;

    ctx->use_vad = ctx->n_samples_step <= 0; // sliding window mode uses VAD

    // number of steps to print new line
    ctx->n_new_line = !ctx->use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1;
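    // For reference: with the default parameters (step 3000 ms, length 10000 ms,
    // keep 200 ms) and WHISPER_SAMPLE_RATE = 16000 Hz, this works out to 48000,
    // 160000 and 3200 samples, and a newline is emitted every
    // max(1, 10000 / 3000 - 1) = 2 steps.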
    params.no_timestamps = !ctx->use_vad;
    params.no_context   |= ctx->use_vad;
    params.max_tokens    = 0;

    // init audio
    ctx->audio = std::make_unique<audio_async>(params.length_ms);
    if (!ctx->audio->init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return NULL;
    }

    ctx->audio->resume();

    // whisper init
    if (whisper_lang_id(params.language) == -1) {
        fprintf(stderr, "%s: unknown language '%s'\n", __func__, params.language);
        return NULL;
    }

    if ((ctx->whisper = unique_whisper(whisper_init_from_file(params.model))) == NULL) {
        return NULL;
    }

    ctx->pcmf32     = std::vector<float>(n_samples_30s, 0.0f);
    ctx->pcmf32_new = std::vector<float>(n_samples_30s, 0.0f);

    ctx->t_last  = std::chrono::high_resolution_clock::now();
    ctx->t_start = ctx->t_last;

    ctx->params = params;

    return ctx.release();
}

void stream_free(stream_context *ctx) {
    ctx->audio   = NULL;
    ctx->whisper = NULL;

    ctx->pcmf32.clear();
    ctx->pcmf32_old.clear();
    ctx->pcmf32_new.clear();
    ctx->prompt_tokens.clear();

    // the context itself was allocated by stream_init(), so release it here as well
    delete ctx;
}

int stream_run(stream_context *ctx, void *callback_ctx, stream_callback_t callback) {
    auto params  = ctx->params;
    auto whisper = ctx->whisper.get();

    auto t_now = std::chrono::high_resolution_clock::now();

    if (!ctx->use_vad) {
        while (true) {
            ctx->audio->get(params.step_ms, ctx->pcmf32_new);

            if ((int) ctx->pcmf32_new.size() > 2 * ctx->n_samples_step) {
                fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
                ctx->audio->clear();
                continue;
            }

            if ((int) ctx->pcmf32_new.size() >= ctx->n_samples_step) {
                ctx->audio->clear();
                break;
            }

            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }

        const int n_samples_new = ctx->pcmf32_new.size();

        // take up to params.length_ms audio from previous iteration
        const int n_samples_take = std::min((int) ctx->pcmf32_old.size(),
                std::max(0, ctx->n_samples_keep + ctx->n_samples_len - n_samples_new));

        ctx->pcmf32.resize(n_samples_new + n_samples_take);

        for (int i = 0; i < n_samples_take; i++) {
            ctx->pcmf32[i] = ctx->pcmf32_old[ctx->pcmf32_old.size() - n_samples_take + i];
        }

        memcpy(ctx->pcmf32.data() + n_samples_take, ctx->pcmf32_new.data(), n_samples_new * sizeof(float));

        ctx->pcmf32_old = ctx->pcmf32;
    } else {
        auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - ctx->t_last).count();

        if (t_diff < 2000) {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            return 0;
        }

        // process new audio
        ctx->audio->get(2000, ctx->pcmf32_new);

        if (::vad_simple(ctx->pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
            ctx->audio->get(params.length_ms, ctx->pcmf32);
        } else {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            return 0;
        }

        ctx->t_last = t_now;
    }
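    // Note on timing: in step mode the window assembled above is decoded as a
    // single segment and the callback gets t0/t1 derived from wall-clock time
    // since stream_init(); in VAD mode whisper runs only after a pause is
    // detected and its own per-segment timestamps are passed through instead.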
    // run the inference
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.print_progress   = false;
    wparams.print_special    = params.print_special;
    wparams.print_realtime   = false;
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate        = params.translate;
    wparams.no_context       = true;
    wparams.single_segment   = !ctx->use_vad;
    wparams.max_tokens       = params.max_tokens;
    wparams.language         = params.language;
    wparams.n_threads        = params.n_threads;

    wparams.audio_ctx = params.audio_ctx;
    wparams.speed_up  = params.speed_up;

    // disable temperature fallback
    wparams.temperature_inc = -1.0f;

    wparams.prompt_tokens   = params.no_context ? nullptr : ctx->prompt_tokens.data();
    wparams.prompt_n_tokens = params.no_context ? 0       : ctx->prompt_tokens.size();

    const int64_t t1 = (t_now - ctx->t_start).count() / 1000000;
    const int64_t t0 = std::max(0.0, t1 - ctx->pcmf32.size() * 1000.0 / WHISPER_SAMPLE_RATE);

    if (whisper_full(whisper, wparams, ctx->pcmf32.data(), ctx->pcmf32.size()) != 0) {
        fprintf(stderr, "%s: failed to process audio\n", __func__);
        return 6;
    }

    const int n_segments = whisper_full_n_segments(whisper);
    for (int i = 0; i < n_segments; ++i) {
        const char *text = whisper_full_get_segment_text(whisper, i);

        const int64_t segment_t0 = whisper_full_get_segment_t0(whisper, i);
        const int64_t segment_t1 = whisper_full_get_segment_t1(whisper, i);

        callback(text, ctx->use_vad ? segment_t0 : t0, ctx->use_vad ? segment_t1 : t1, callback_ctx);
    }

    ++ctx->n_iter;

    if (!ctx->use_vad && (ctx->n_iter % ctx->n_new_line) == 0) {
        callback(NULL, 0, 0, callback_ctx);

        // keep part of the audio for next iteration to try to mitigate word boundary issues
        ctx->pcmf32_old = std::vector<float>(ctx->pcmf32.end() - ctx->n_samples_keep, ctx->pcmf32.end());

        // Add tokens of the last full length segment as the prompt
        if (!params.no_context) {
            ctx->prompt_tokens.clear();

            const int n_segments = whisper_full_n_segments(whisper);
            for (int i = 0; i < n_segments; ++i) {
                const int token_count = whisper_full_n_tokens(whisper, i);
                for (int j = 0; j < token_count; ++j) {
                    ctx->prompt_tokens.push_back(whisper_full_get_token_id(whisper, i, j));
                }
            }
        }
    }

    return 0;
}
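/*
   A minimal usage sketch (not part of this file). It assumes the declarations
   in stream.h match the way they are used above, i.e. that stream_callback_t
   is a function taking (const char *text, int64_t t0, int64_t t1, void
   *callback_ctx) and that text is NULL when a step-mode window is finalized:

       static void on_text(const char *text, int64_t t0, int64_t t1, void *user) {
           if (text == NULL) {
               printf("\n");        // window finalized in step mode
           } else {
               printf("%s", text);
           }
           fflush(stdout);
       }

       int main() {
           stream_params params = stream_default_params();

           stream_context *ctx = stream_init(params);
           if (ctx == NULL) {
               return 1;
           }

           // stream_run() performs at most one transcription pass per call,
           // so the caller drives the loop and decides when to stop.
           while (true) {
               if (stream_run(ctx, NULL, on_text) != 0) {
                   break;
               }
           }

           stream_free(ctx);
           return 0;
       }
*/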