initial commit
This commit is contained in:
43
LibWhisper/CaptureDevice.swift
Normal file
43
LibWhisper/CaptureDevice.swift
Normal file
@ -0,0 +1,43 @@
|
||||
/// Failures that can occur while enumerating audio capture devices.
public enum CaptureDeviceError: Error {
    /// An SDL call reported failure; the payload is the raw status code that
    /// the SDL function returned.
    case sdlErrorCode(Int32)
}
|
||||
|
||||
/// An audio capture (recording) device as enumerated through SDL.
public struct CaptureDevice {
    /// Index of the device in SDL's capture-device list at enumeration time.
    public let id: Int32
    /// Device name as reported by SDL.
    public let name: String

    /// Creates a device description from an SDL device index and its name.
    public init(id: Int32, name: String) {
        self.id = id
        self.name = name
    }

    /// The capture devices SDL currently exposes.
    ///
    /// Initialises SDL's audio subsystem and then queries the capture-device
    /// list. Devices whose name cannot be retrieved are skipped.
    ///
    /// - Throws: `CaptureDeviceError.sdlErrorCode` when `SDL_Init` fails.
    public static var devices: [CaptureDevice] {
        get throws {
            let status = SDL_Init(SDL_INIT_AUDIO)
            guard status >= 0 else {
                throw CaptureDeviceError.sdlErrorCode(status)
            }

            // SDL_GetNumAudioDevices may return -1 when no explicit device list
            // can be determined; building `0..<(-1)` would trap at runtime.
            let count = max(0, SDL_GetNumAudioDevices(1))

            var devices = [CaptureDevice]()
            devices.reserveCapacity(Int(count))

            for index in 0..<count {
                // The name pointer is NULL on error (e.g. the device vanished
                // between the count query and this call); never hand NULL to
                // String(cString:).
                let rawName: UnsafePointer<CChar>? = SDL_GetAudioDeviceName(index, 1)
                guard let cName = rawName else {
                    continue
                }
                devices.append(CaptureDevice(id: index, name: String(cString: cName)))
            }

            return devices
        }
    }
}
|
||||
|
||||
// MARK: - Equatable

extension CaptureDevice: Equatable {
    /// Devices compare equal when their `id`s match; `name` is not considered.
    public static func == (lhs: Self, rhs: Self) -> Bool {
        lhs.id == rhs.id
    }
}
|
||||
|
||||
// MARK: - Hashable

extension CaptureDevice: Hashable {
    /// Feeds only `id` into the hasher, mirroring the `==` implementation.
    public func hash(into hasher: inout Hasher) {
        id.hash(into: &hasher)
    }
}
|
||||
15
LibWhisper/LibWhisper.h
Normal file
15
LibWhisper/LibWhisper.h
Normal file
@ -0,0 +1,15 @@
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
//! Project version number for LibWhisper.
|
||||
FOUNDATION_EXPORT double LibWhisperVersionNumber;
|
||||
|
||||
//! Project version string for LibWhisper.
|
||||
FOUNDATION_EXPORT const unsigned char LibWhisperVersionString[];
|
||||
|
||||
// SDL functions used in CaptureDevice
|
||||
#define SDL_INIT_AUDIO 0x00000010u
|
||||
extern int SDL_Init(uint32_t flags);
|
||||
extern int SDL_GetNumAudioDevices(int iscapture);
|
||||
extern const char * SDL_GetAudioDeviceName(int index, int iscapture);
|
||||
|
||||
#import "stream.h"
|
||||
233
LibWhisper/SDL.h
Normal file
233
LibWhisper/SDL.h
Normal file
@ -0,0 +1,233 @@
|
||||
/*
|
||||
Simple DirectMedia Layer
|
||||
Copyright (C) 1997-2023 Sam Lantinga <slouken@libsdl.org>
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file SDL.h
|
||||
*
|
||||
* Main include header for the SDL library
|
||||
*/
|
||||
|
||||
|
||||
#ifndef SDL_h_
|
||||
#define SDL_h_
|
||||
|
||||
#include "SDL_main.h"
|
||||
#include "SDL_stdinc.h"
|
||||
#include "SDL_assert.h"
|
||||
#include "SDL_atomic.h"
|
||||
#include "SDL_audio.h"
|
||||
#include "SDL_clipboard.h"
|
||||
#include "SDL_cpuinfo.h"
|
||||
#include "SDL_endian.h"
|
||||
#include "SDL_error.h"
|
||||
#include "SDL_events.h"
|
||||
#include "SDL_filesystem.h"
|
||||
#include "SDL_gamecontroller.h"
|
||||
#include "SDL_guid.h"
|
||||
#include "SDL_haptic.h"
|
||||
#include "SDL_hidapi.h"
|
||||
#include "SDL_hints.h"
|
||||
#include "SDL_joystick.h"
|
||||
#include "SDL_loadso.h"
|
||||
#include "SDL_log.h"
|
||||
#include "SDL_messagebox.h"
|
||||
#include "SDL_metal.h"
|
||||
#include "SDL_mutex.h"
|
||||
#include "SDL_power.h"
|
||||
#include "SDL_render.h"
|
||||
#include "SDL_rwops.h"
|
||||
#include "SDL_sensor.h"
|
||||
#include "SDL_shape.h"
|
||||
#include "SDL_system.h"
|
||||
#include "SDL_thread.h"
|
||||
#include "SDL_timer.h"
|
||||
#include "SDL_version.h"
|
||||
#include "SDL_video.h"
|
||||
#include "SDL_locale.h"
|
||||
#include "SDL_misc.h"
|
||||
|
||||
#include "begin_code.h"
|
||||
/* Set up for C function definitions, even when using C++ */
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* As of version 0.5, SDL is loaded dynamically into the application */
|
||||
|
||||
/**
|
||||
* \name SDL_INIT_*
|
||||
*
|
||||
* These are the flags which may be passed to SDL_Init(). You should
|
||||
* specify the subsystems which you will be using in your application.
|
||||
*/
|
||||
/* @{ */
|
||||
#define SDL_INIT_TIMER 0x00000001u
|
||||
#define SDL_INIT_AUDIO 0x00000010u
|
||||
#define SDL_INIT_VIDEO 0x00000020u /**< SDL_INIT_VIDEO implies SDL_INIT_EVENTS */
|
||||
#define SDL_INIT_JOYSTICK 0x00000200u /**< SDL_INIT_JOYSTICK implies SDL_INIT_EVENTS */
|
||||
#define SDL_INIT_HAPTIC 0x00001000u
|
||||
#define SDL_INIT_GAMECONTROLLER 0x00002000u /**< SDL_INIT_GAMECONTROLLER implies SDL_INIT_JOYSTICK */
|
||||
#define SDL_INIT_EVENTS 0x00004000u
|
||||
#define SDL_INIT_SENSOR 0x00008000u
|
||||
#define SDL_INIT_NOPARACHUTE 0x00100000u /**< compatibility; this flag is ignored. */
|
||||
#define SDL_INIT_EVERYTHING ( \
|
||||
SDL_INIT_TIMER | SDL_INIT_AUDIO | SDL_INIT_VIDEO | SDL_INIT_EVENTS | \
|
||||
SDL_INIT_JOYSTICK | SDL_INIT_HAPTIC | SDL_INIT_GAMECONTROLLER | SDL_INIT_SENSOR \
|
||||
)
|
||||
/* @} */
|
||||
|
||||
/**
|
||||
* Initialize the SDL library.
|
||||
*
|
||||
* SDL_Init() simply forwards to calling SDL_InitSubSystem(). Therefore, the
|
||||
* two may be used interchangeably. Though for readability of your code
|
||||
* SDL_InitSubSystem() might be preferred.
|
||||
*
|
||||
* The file I/O (for example: SDL_RWFromFile) and threading (SDL_CreateThread)
|
||||
* subsystems are initialized by default. Message boxes
|
||||
* (SDL_ShowSimpleMessageBox) also attempt to work without initializing the
|
||||
* video subsystem, in hopes of being useful in showing an error dialog when
|
||||
* SDL_Init fails. You must specifically initialize other subsystems if you
|
||||
* use them in your application.
|
||||
*
|
||||
* Logging (such as SDL_Log) works without initialization, too.
|
||||
*
|
||||
* `flags` may be any of the following OR'd together:
|
||||
*
|
||||
* - `SDL_INIT_TIMER`: timer subsystem
|
||||
* - `SDL_INIT_AUDIO`: audio subsystem
|
||||
* - `SDL_INIT_VIDEO`: video subsystem; automatically initializes the events
|
||||
* subsystem
|
||||
* - `SDL_INIT_JOYSTICK`: joystick subsystem; automatically initializes the
|
||||
* events subsystem
|
||||
* - `SDL_INIT_HAPTIC`: haptic (force feedback) subsystem
|
||||
* - `SDL_INIT_GAMECONTROLLER`: controller subsystem; automatically
|
||||
* initializes the joystick subsystem
|
||||
* - `SDL_INIT_EVENTS`: events subsystem
|
||||
* - `SDL_INIT_EVERYTHING`: all of the above subsystems
|
||||
* - `SDL_INIT_NOPARACHUTE`: compatibility; this flag is ignored
|
||||
*
|
||||
* Subsystem initialization is ref-counted, you must call SDL_QuitSubSystem()
|
||||
* for each SDL_InitSubSystem() to correctly shutdown a subsystem manually (or
|
||||
* call SDL_Quit() to force shutdown). If a subsystem is already loaded then
|
||||
* this call will increase the ref-count and return.
|
||||
*
|
||||
* \param flags subsystem initialization flags
|
||||
* \returns 0 on success or a negative error code on failure; call
|
||||
* SDL_GetError() for more information.
|
||||
*
|
||||
* \since This function is available since SDL 2.0.0.
|
||||
*
|
||||
* \sa SDL_InitSubSystem
|
||||
* \sa SDL_Quit
|
||||
* \sa SDL_SetMainReady
|
||||
* \sa SDL_WasInit
|
||||
*/
|
||||
extern DECLSPEC int SDLCALL SDL_Init(Uint32 flags);
|
||||
|
||||
/**
|
||||
* Compatibility function to initialize the SDL library.
|
||||
*
|
||||
* In SDL2, this function and SDL_Init() are interchangeable.
|
||||
*
|
||||
* \param flags any of the flags used by SDL_Init(); see SDL_Init for details.
|
||||
* \returns 0 on success or a negative error code on failure; call
|
||||
* SDL_GetError() for more information.
|
||||
*
|
||||
* \since This function is available since SDL 2.0.0.
|
||||
*
|
||||
* \sa SDL_Init
|
||||
* \sa SDL_Quit
|
||||
* \sa SDL_QuitSubSystem
|
||||
*/
|
||||
extern DECLSPEC int SDLCALL SDL_InitSubSystem(Uint32 flags);
|
||||
|
||||
/**
|
||||
* Shut down specific SDL subsystems.
|
||||
*
|
||||
* If you start a subsystem using a call to that subsystem's init function
|
||||
* (for example SDL_VideoInit()) instead of SDL_Init() or SDL_InitSubSystem(),
|
||||
* SDL_QuitSubSystem() and SDL_WasInit() will not work. You will need to use
|
||||
* that subsystem's quit function (SDL_VideoQuit()) directly instead. But
|
||||
* generally, you should not be using those functions directly anyhow; use
|
||||
* SDL_Init() instead.
|
||||
*
|
||||
* You still need to call SDL_Quit() even if you close all open subsystems
|
||||
* with SDL_QuitSubSystem().
|
||||
*
|
||||
* \param flags any of the flags used by SDL_Init(); see SDL_Init for details.
|
||||
*
|
||||
* \since This function is available since SDL 2.0.0.
|
||||
*
|
||||
* \sa SDL_InitSubSystem
|
||||
* \sa SDL_Quit
|
||||
*/
|
||||
extern DECLSPEC void SDLCALL SDL_QuitSubSystem(Uint32 flags);
|
||||
|
||||
/**
|
||||
* Get a mask of the specified subsystems which are currently initialized.
|
||||
*
|
||||
* \param flags any of the flags used by SDL_Init(); see SDL_Init for details.
|
||||
* \returns a mask of all initialized subsystems if `flags` is 0, otherwise it
|
||||
* returns the initialization status of the specified subsystems.
|
||||
*
|
||||
* The return value does not include SDL_INIT_NOPARACHUTE.
|
||||
*
|
||||
* \since This function is available since SDL 2.0.0.
|
||||
*
|
||||
* \sa SDL_Init
|
||||
* \sa SDL_InitSubSystem
|
||||
*/
|
||||
extern DECLSPEC Uint32 SDLCALL SDL_WasInit(Uint32 flags);
|
||||
|
||||
/**
|
||||
* Clean up all initialized subsystems.
|
||||
*
|
||||
* You should call this function even if you have already shutdown each
|
||||
* initialized subsystem with SDL_QuitSubSystem(). It is safe to call this
|
||||
* function even in the case of errors in initialization.
|
||||
*
|
||||
* If you start a subsystem using a call to that subsystem's init function
|
||||
* (for example SDL_VideoInit()) instead of SDL_Init() or SDL_InitSubSystem(),
|
||||
* then you must use that subsystem's quit function (SDL_VideoQuit()) to shut
|
||||
* it down before calling SDL_Quit(). But generally, you should not be using
|
||||
* those functions directly anyhow; use SDL_Init() instead.
|
||||
*
|
||||
* You can use this function with atexit() to ensure that it is run when your
|
||||
* application is shutdown, but it is not wise to do this from a library or
|
||||
* other dynamically loaded code.
|
||||
*
|
||||
* \since This function is available since SDL 2.0.0.
|
||||
*
|
||||
* \sa SDL_Init
|
||||
* \sa SDL_QuitSubSystem
|
||||
*/
|
||||
extern DECLSPEC void SDLCALL SDL_Quit(void);
|
||||
|
||||
/* Ends C function definitions when using C++ */
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#include "close_code.h"
|
||||
|
||||
#endif /* SDL_h_ */
|
||||
|
||||
/* vi: set ts=4 sw=4 expandtab: */
|
||||
98
LibWhisper/WhisperStream.swift
Normal file
98
LibWhisper/WhisperStream.swift
Normal file
@ -0,0 +1,98 @@
|
||||
import AVFoundation
|
||||
|
||||
/// One piece of transcribed text together with the time range it was
/// reported for.
///
/// The fields are public so that clients of the framework, which receive
/// `Segment` values through `WhisperStream.segments`, can actually read them.
public struct Segment {
    /// Transcribed text of this segment (empty for a placeholder segment).
    public let text: String
    /// Start timestamp as delivered by the native stream callback; `-1` for a
    /// placeholder. NOTE(review): `WhisperStream` compares differences of this
    /// value against `window * 1000`, i.e. treats it as milliseconds — confirm
    /// the unit for every streaming mode.
    public let t0: Int64
    /// End timestamp as delivered by the native stream callback; `-1` for a
    /// placeholder. Same unit caveat as `t0`.
    public let t1: Int64

    /// Creates a segment from its text and time range.
    public init(text: String, t0: Int64, t1: Int64) {
        self.text = text
        self.t0 = t0
        self.t1 = t1
    }
}
|
||||
|
||||
public typealias OrderedSegments = [Segment]
|
||||
|
||||
public extension OrderedSegments {
    /// The text of every segment concatenated in array order, with no
    /// separator inserted between segments.
    var text: any StringProtocol {
        var combined = ""
        for segment in self {
            combined += segment.text
        }
        return combined
    }
}
|
||||
|
||||
/// Runs the native whisper streaming loop on a dedicated thread and exposes
/// the transcription produced so far.
///
/// NOTE(review): `segments` and `alive` are mutated on this thread (from
/// `task()` and `callback`), not on the main thread — observers that drive UI
/// need to hop to the main thread themselves.
public class WhisperStream: Thread {
    /// Entered in `start()` and left when `main()` finishes, so that `join()`
    /// can block until the thread body has completed.
    let waiter = DispatchGroup()

    /// Transcribed segments, oldest first, trimmed by `callback` according to
    /// `window`.
    @Published public private(set) var segments = OrderedSegments()
    /// `true` until the streaming loop has ended, including the case where the
    /// native stream could not be initialised.
    @Published public private(set) var alive = true

    /// Location of the model file; its path is passed to `stream_init`.
    let model: URL
    /// Capture device to record from, or `nil` to keep the `capture_id` that
    /// `stream_default_params()` provides.
    let device: CaptureDevice?
    /// Amount of history to keep in `segments`. It is multiplied by 1000
    /// before being compared with segment timestamps.
    let window: TimeInterval

    /// Creates a stream; nothing runs until `start()` is called.
    ///
    /// - Parameters:
    ///   - model: File URL of the whisper model to load.
    ///   - device: Capture device to use, or `nil` for the native default.
    ///   - window: History to retain in `segments` (default 300).
    public init(model: URL, device: CaptureDevice? = nil, window: TimeInterval = 300) {
        self.model = model
        self.device = device
        self.window = window
        super.init()
    }

    /// Starts the thread and registers it with `waiter` so `join()` can wait.
    public override func start() {
        waiter.enter()
        super.start()
    }

    /// Thread entry point: runs the streaming loop, then releases `join()`.
    public override func main() {
        defer { waiter.leave() }
        task()
    }

    /// Blocks the caller until the thread body started by `start()` finishes.
    public func join() {
        waiter.wait()
    }

    /// Initialises the native stream and pumps it until the thread is
    /// cancelled or `stream_run` reports a non-zero status.
    func task() {
        // Every exit path — including a failed stream_init — must flip
        // `alive`; previously a failed init left it stuck at `true`.
        defer { alive = false }

        // `params.model` borrows the C string, so the whole stream lifetime
        // has to stay inside this closure.
        model.path.withCString { modelCStr in
            var params = stream_default_params()
            params.model = modelCStr

            if let device = device {
                params.capture_id = device.id
            }

            guard let ctx = stream_init(params) else {
                return
            }
            defer { stream_free(ctx) }

            // The C callback cannot capture Swift state, so `self` travels
            // through the opaque context pointer instead.
            let owner = Unmanaged.passUnretained(self).toOpaque()

            while !isCancelled {
                let status = stream_run(ctx, owner) { cText, t0, t1, opaque in
                    guard let opaque = opaque else {
                        return 0
                    }
                    let stream = Unmanaged<WhisperStream>.fromOpaque(opaque).takeUnretainedValue()
                    return stream.callback(
                        text: cText.map { String(cString: $0) },
                        t0: t0,
                        t1: t1
                    )
                }
                if status != 0 {
                    break
                }
            }
        }
    }

    /// Receives one result from the native stream.
    ///
    /// A non-nil `text` replaces the newest segment; a nil `text` appends an
    /// empty placeholder (`t0 == t1 == -1`) that the next non-nil result will
    /// overwrite. Afterwards, segments whose `t0` lies more than
    /// `window * 1000` before the newest segment's `t0` are dropped from the
    /// front.
    ///
    /// - Returns: Always `0`.
    func callback(text: String?, t0: Int64, t1: Int64) -> Int32 {
        if segments.isEmpty || text == nil {
            segments.append(Segment(text: "", t0: -1, t1: -1))
        }
        if let text = text {
            segments[segments.count - 1] = Segment(text: text, t0: t0, t1: t1)
        }

        // `segments` is never empty here because of the append above.
        if let newest = segments.last {
            let horizon = Int64(window * 1000)
            let expired = segments.filter { newest.t0 - $0.t0 > horizon }.count
            segments.removeFirst(expired)
        }

        return 0
    }
}
|
||||
240
LibWhisper/stream.cpp
Normal file
240
LibWhisper/stream.cpp
Normal file
@ -0,0 +1,240 @@
|
||||
// This code is based on the streaming example provided with whisper.cpp:
|
||||
// https://github.com/ggerganov/whisper.cpp/blob/ca21f7ab16694384fb74b1ba4f68b39f16540d23/examples/stream/stream.cpp
|
||||
|
||||
#include "common.h"
|
||||
#include "common-sdl.h"
|
||||
#include "whisper.h"
|
||||
#include "stream.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
|
||||
using unique_whisper = std::unique_ptr<whisper_context, std::integral_constant<decltype(&whisper_free), &whisper_free>>;
|
||||
|
||||
struct stream_context {
|
||||
stream_params params;
|
||||
std::unique_ptr<audio_async> audio;
|
||||
unique_whisper whisper;
|
||||
std::vector<float> pcmf32;
|
||||
std::vector<float> pcmf32_old;
|
||||
std::vector<float> pcmf32_new;
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> t_last;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
|
||||
int n_samples_step;
|
||||
int n_samples_len;
|
||||
int n_samples_keep;
|
||||
bool use_vad;
|
||||
int n_new_line;
|
||||
int n_iter = 0;
|
||||
};
|
||||
|
||||
struct stream_params stream_default_params() {
|
||||
return stream_params {
|
||||
/* .n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
||||
/* .step_ms =*/ 3000,
|
||||
/* .length_ms =*/ 10000,
|
||||
/* .keep_ms =*/ 200,
|
||||
/* .capture_id =*/ -1,
|
||||
/* .max_tokens =*/ 32,
|
||||
/* .audio_ctx =*/ 0,
|
||||
|
||||
/* .vad_thold =*/ 0.6f,
|
||||
/* .freq_thold =*/ 100.0f,
|
||||
|
||||
/* .speed_up =*/ false,
|
||||
/* .translate =*/ false,
|
||||
/* .print_special =*/ false,
|
||||
/* .no_context =*/ true,
|
||||
/* .no_timestamps =*/ false,
|
||||
|
||||
/* .language =*/ "en",
|
||||
/* .model =*/ "models/ggml-base.en.bin"
|
||||
};
|
||||
}
|
||||
|
||||
// Creates a streaming session: validates the parameters, opens and resumes
// the audio capture device, and loads the whisper model.
//
// `params.language` and `params.model` are borrowed, not copied: the stored
// parameters keep the raw pointers, and stream_run() reads `language` on every
// call, so the strings must outlive the returned context.
//
// Returns a context to be released with stream_free(), or NULL on failure
// (a message is written to stderr).
stream_context *stream_init(stream_params params) {
    // Cheap validation first, so a bad argument never opens the audio device.
    if (params.language == NULL || params.model == NULL) {
        fprintf(stderr, "%s: language and model must not be NULL\n", __func__);
        return NULL;
    }

    if (whisper_lang_id(params.language) == -1) {
        fprintf(stderr, "%s: unknown language '%s'\n", __func__, params.language);
        return NULL;
    }

    // Owned by a unique_ptr until the very end so every early return below
    // tears down whatever has been set up so far.
    auto ctx = std::make_unique<stream_context>();

    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
    params.length_ms = std::max(params.length_ms, params.step_ms);

    ctx->n_samples_step = (1e-3 * params.step_ms)   * WHISPER_SAMPLE_RATE;
    ctx->n_samples_len  = (1e-3 * params.length_ms) * WHISPER_SAMPLE_RATE;
    ctx->n_samples_keep = (1e-3 * params.keep_ms)   * WHISPER_SAMPLE_RATE;
    const int n_samples_30s = (1e-3 * 30000.0) * WHISPER_SAMPLE_RATE;

    // A non-positive step selects sliding-window mode, which relies on VAD.
    ctx->use_vad = ctx->n_samples_step <= 0;

    // Number of steps after which stream_run() signals a new line. The
    // division is only evaluated when step_ms > 0.
    ctx->n_new_line = !ctx->use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1;

    // These caller-supplied values are overridden unconditionally.
    params.no_timestamps  = !ctx->use_vad;
    params.no_context    |= ctx->use_vad;
    params.max_tokens     = 0;

    // init audio
    ctx->audio = std::make_unique<audio_async>(params.length_ms);
    if (!ctx->audio->init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return NULL;
    }

    ctx->audio->resume();

    // whisper init
    ctx->whisper = unique_whisper(whisper_init_from_file(params.model));
    if (ctx->whisper == NULL) {
        fprintf(stderr, "%s: failed to load model '%s'\n", __func__, params.model);
        return NULL;
    }

    ctx->pcmf32     = std::vector<float>(n_samples_30s, 0.0f);
    ctx->pcmf32_new = std::vector<float>(n_samples_30s, 0.0f);

    ctx->t_last  = std::chrono::high_resolution_clock::now();
    ctx->t_start = ctx->t_last;

    ctx->params = params;

    // Ownership passes to the caller, who must hand it back to stream_free().
    return ctx.release();
}
|
||||
|
||||
// Releases a context obtained from stream_init(). Passing NULL is a no-op.
//
// stream_init() hands the context out via unique_ptr::release(), so it has to
// be deleted here; merely resetting its members would leak the context object
// itself.
void stream_free(stream_context *ctx) {
    if (ctx == NULL) {
        return;
    }

    // Keep the original teardown order: audio capture first, then the model.
    ctx->audio.reset();
    ctx->whisper.reset();

    // Destroys the remaining members (sample buffers, prompt tokens).
    delete ctx;
}
|
||||
|
||||
// Runs one iteration of the streaming loop: gathers audio, runs whisper on it
// and reports every resulting segment through `callback`.
//
//   ctx          - context from stream_init()
//   callback_ctx - opaque pointer passed through to `callback` unchanged
//   callback     - invoked once per segment with (text, t0, t1, callback_ctx),
//                  and with text == NULL when a new line should be started
//
// Returns 0 on success (including "nothing to process yet" in VAD mode),
// -1 when `ctx` or `callback` is NULL, 6 when whisper_full() fails, or the
// first non-zero value returned by `callback`.
int stream_run(stream_context *ctx, void *callback_ctx, stream_callback_t callback) {
    if (ctx == NULL || callback == NULL) {
        fprintf(stderr, "%s: invalid arguments\n", __func__);
        return -1;
    }

    auto params  = ctx->params;
    auto whisper = ctx->whisper.get();

    auto t_now = std::chrono::high_resolution_clock::now();

    if (!ctx->use_vad) {
        // Fixed-step mode: block until one full step of new audio is available.
        while (true) {
            ctx->audio->get(params.step_ms, ctx->pcmf32_new);

            if ((int)ctx->pcmf32_new.size() > 2 * ctx->n_samples_step) {
                fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
                ctx->audio->clear();
                continue;
            }

            if ((int)ctx->pcmf32_new.size() >= ctx->n_samples_step) {
                ctx->audio->clear();
                break;
            }

            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }

        const int n_samples_new = ctx->pcmf32_new.size();

        // take up to params.length_ms audio from previous iteration
        const int n_samples_take = std::min((int)ctx->pcmf32_old.size(), std::max(0, ctx->n_samples_keep + ctx->n_samples_len - n_samples_new));

        ctx->pcmf32.resize(n_samples_new + n_samples_take);

        // Tail of the previous buffer first, then the fresh samples.
        for (int i = 0; i < n_samples_take; i++) {
            ctx->pcmf32[i] = ctx->pcmf32_old[ctx->pcmf32_old.size() - n_samples_take + i];
        }

        memcpy(ctx->pcmf32.data() + n_samples_take, ctx->pcmf32_new.data(), n_samples_new * sizeof(float));

        ctx->pcmf32_old = ctx->pcmf32;
    } else {
        // VAD mode: look at the audio at most once every 2000 ms.
        auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - ctx->t_last).count();
        if (t_diff < 2000) {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            return 0;
        }

        // process new audio
        ctx->audio->get(2000, ctx->pcmf32_new);

        if (::vad_simple(ctx->pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
            ctx->audio->get(params.length_ms, ctx->pcmf32);
        } else {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            return 0;
        }

        ctx->t_last = t_now;
    }

    // run the inference
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.print_progress   = false;
    wparams.print_special    = params.print_special;
    wparams.print_realtime   = false;
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate        = params.translate;
    wparams.no_context       = true;
    wparams.single_segment   = !ctx->use_vad;
    wparams.max_tokens       = params.max_tokens;
    wparams.language         = params.language;
    wparams.n_threads        = params.n_threads;

    wparams.audio_ctx        = params.audio_ctx;
    wparams.speed_up         = params.speed_up;

    // disable temperature fallback
    wparams.temperature_inc  = -1.0f;

    wparams.prompt_tokens    = params.no_context ? nullptr : ctx->prompt_tokens.data();
    wparams.prompt_n_tokens  = params.no_context ? 0       : ctx->prompt_tokens.size();

    // Milliseconds since stream_init(). duration_cast keeps this correct
    // whatever tick period the clock uses, instead of assuming nanoseconds.
    const int64_t t1 = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - ctx->t_start).count();
    const int64_t t0 = std::max(0.0, t1 - ctx->pcmf32.size() * 1000.0 / WHISPER_SAMPLE_RATE);

    if (whisper_full(whisper, wparams, ctx->pcmf32.data(), ctx->pcmf32.size()) != 0) {
        fprintf(stderr, "%s: failed to process audio\n", __func__);
        return 6;
    }

    const int n_segments = whisper_full_n_segments(whisper);
    for (int i = 0; i < n_segments; ++i) {
        const char *text = whisper_full_get_segment_text(whisper, i);

        const int64_t segment_t0 = whisper_full_get_segment_t0(whisper, i);
        const int64_t segment_t1 = whisper_full_get_segment_t1(whisper, i);

        // VAD mode reports whisper's own segment times; fixed-step mode
        // reports the wall-clock range computed above.
        const int rc = callback(text, ctx->use_vad ? segment_t0 : t0, ctx->use_vad ? segment_t1 : t1, callback_ctx);
        if (rc != 0) {
            // The consumer asked to stop; do not swallow its status.
            return rc;
        }
    }

    ++ctx->n_iter;

    int new_line_rc = 0;

    if (!ctx->use_vad && (ctx->n_iter % ctx->n_new_line) == 0) {
        // A NULL text tells the consumer to start a new line.
        new_line_rc = callback(NULL, 0, 0, callback_ctx);

        // keep part of the audio for next iteration to try to mitigate word boundary issues
        ctx->pcmf32_old = std::vector<float>(ctx->pcmf32.end() - ctx->n_samples_keep, ctx->pcmf32.end());

        // Add tokens of the last full length segment as the prompt
        if (!params.no_context) {
            ctx->prompt_tokens.clear();

            for (int i = 0; i < n_segments; ++i) {
                const int token_count = whisper_full_n_tokens(whisper, i);
                for (int j = 0; j < token_count; ++j) {
                    ctx->prompt_tokens.push_back(whisper_full_get_token_id(whisper, i, j));
                }
            }
        }
    }

    // Bookkeeping above is finished either way; surface the callback's status.
    return new_line_rc;
}
|
||||
42
LibWhisper/stream.h
Normal file
42
LibWhisper/stream.h
Normal file
@ -0,0 +1,42 @@
|
||||
#ifndef LIBWHISPER_STREAM_H
#define LIBWHISPER_STREAM_H

#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Configuration for a streaming transcription session. */
typedef struct stream_params {
    int32_t n_threads;   /* threads used for whisper inference */
    int32_t step_ms;     /* audio step size; a value <= 0 selects VAD (sliding window) mode */
    int32_t length_ms;   /* audio length per inference; raised to at least step_ms by stream_init */
    int32_t keep_ms;     /* audio carried over from the previous step; capped at step_ms by stream_init */
    int32_t capture_id;  /* capture device id handed to the audio layer (default -1) */
    int32_t max_tokens;  /* NOTE(review): stream_init currently overwrites this with 0 */
    int32_t audio_ctx;   /* forwarded to whisper's audio_ctx setting */

    float vad_thold;     /* threshold passed to the voice-activity check */
    float freq_thold;    /* frequency threshold passed to the voice-activity check */

    bool speed_up;       /* forwarded to whisper's speed_up setting */
    bool translate;      /* forwarded to whisper's translate setting */
    bool print_special;  /* forwarded to whisper's print_special setting */
    bool no_context;     /* when false, tokens of the last full segment are reused as prompt; forced on in VAD mode */
    bool no_timestamps;  /* NOTE(review): stream_init overwrites this based on the selected mode */

    const char *language; /* language code checked with whisper_lang_id; borrowed, must outlive the context */
    const char *model;    /* path of the model file to load; borrowed */
} stream_params_t;

/* Returns the default parameter set. */
stream_params_t stream_default_params(void);

/* Opaque handle to a streaming session. */
typedef struct stream_context *stream_context_t;

/* Creates a session from `params`; returns NULL on failure. */
stream_context_t stream_init(stream_params_t params);

/* Releases a session created by stream_init(). */
void stream_free(stream_context_t ctx);

/*
 * Receives one transcription result. `text` is NULL when a new line should be
 * started; `ctx` is the `callback_ctx` value given to stream_run().
 */
typedef int (*stream_callback_t) (const char *text, int64_t t0, int64_t t1, void *ctx);

/*
 * Runs one iteration of the session, invoking `callback` for each produced
 * segment. Returns 0 on success and a non-zero value on failure.
 */
int stream_run(stream_context_t ctx, void *callback_ctx, stream_callback_t callback);

#ifdef __cplusplus
}
#endif

#endif /* LIBWHISPER_STREAM_H */
|
||||
Reference in New Issue
Block a user