initial commit

This commit is contained in:
Sam
2023-03-26 17:31:42 -04:00
commit e3b5b090fb
51 changed files with 4222 additions and 0 deletions

View File

@ -0,0 +1,43 @@
/// Errors raised while enumerating audio capture devices.
public enum CaptureDeviceError: Error {
    /// `SDL_Init` reported a failure; the payload is the negative value it returned.
    case sdlErrorCode(Int32)
}
/// An audio capture (recording) device as enumerated by SDL.
public struct CaptureDevice {
    /// Index of the device in SDL's capture-device list at enumeration time.
    public let id: Int32
    /// Name reported by SDL for this device.
    public let name: String

    public init(id: Int32, name: String) {
        self.id = id
        self.name = name
    }

    /// The capture devices SDL currently exposes.
    ///
    /// Every access calls `SDL_Init(SDL_INIT_AUDIO)` before querying the device list.
    ///
    /// - Throws: `CaptureDeviceError.sdlErrorCode` when `SDL_Init` returns a negative value.
    /// - Returns: One entry per device that has a name; empty when SDL cannot
    ///   provide an explicit device list.
    public static var devices: [CaptureDevice] {
        get throws {
            let result = SDL_Init(SDL_INIT_AUDIO)
            guard result >= 0 else {
                throw CaptureDeviceError.sdlErrorCode(result)
            }

            // SDL_GetNumAudioDevices may return -1 when no explicit list can be
            // determined; `0..<(-1)` would trap, so treat that as "no devices".
            let count = SDL_GetNumAudioDevices(1)
            guard count > 0 else {
                return []
            }

            return (0..<count).compactMap { index -> CaptureDevice? in
                // The name pointer can be NULL (e.g. the list changed between
                // calls); skip such entries instead of crashing in String(cString:).
                guard let cName = SDL_GetAudioDeviceName(index, 1) else {
                    return nil
                }
                return CaptureDevice(id: index, name: String(cString: cName))
            }
        }
    }
}
// Two capture devices are considered equal when their SDL ids match;
// the name does not take part in the comparison.
extension CaptureDevice: Equatable {
    public static func == (lhs: Self, rhs: Self) -> Bool {
        lhs.id == rhs.id
    }
}
// Hashing feeds only `id` into the hasher, matching the id-only `==`.
extension CaptureDevice: Hashable {
    public func hash(into hasher: inout Hasher) {
        id.hash(into: &hasher)
    }
}

15
LibWhisper/LibWhisper.h Normal file
View File

@ -0,0 +1,15 @@
#import <Foundation/Foundation.h>
//! Project version number for LibWhisper.
FOUNDATION_EXPORT double LibWhisperVersionNumber;
//! Project version string for LibWhisper.
FOUNDATION_EXPORT const unsigned char LibWhisperVersionString[];
// SDL functions used in CaptureDevice
//
// Hand-written declarations of the few SDL symbols CaptureDevice calls, kept
// here instead of importing the full SDL headers (presumably so Swift sees
// them through this framework header -- TODO confirm).
// NOTE(review): these must stay in sync with the real SDL headers; the
// SDL_INIT_AUDIO value below matches the definition in SDL.h.
#define SDL_INIT_AUDIO 0x00000010u
// Returns 0 on success or a negative error code on failure (see SDL.h).
extern int SDL_Init(uint32_t flags);
// CaptureDevice passes iscapture = 1 to both functions below to enumerate
// recording devices.
extern int SDL_GetNumAudioDevices(int iscapture);
extern const char * SDL_GetAudioDeviceName(int index, int iscapture);
#import "stream.h"

233
LibWhisper/SDL.h Normal file
View File

@ -0,0 +1,233 @@
/*
Simple DirectMedia Layer
Copyright (C) 1997-2023 Sam Lantinga <slouken@libsdl.org>
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
/**
* \file SDL.h
*
* Main include header for the SDL library
*/
#ifndef SDL_h_
#define SDL_h_
#include "SDL_main.h"
#include "SDL_stdinc.h"
#include "SDL_assert.h"
#include "SDL_atomic.h"
#include "SDL_audio.h"
#include "SDL_clipboard.h"
#include "SDL_cpuinfo.h"
#include "SDL_endian.h"
#include "SDL_error.h"
#include "SDL_events.h"
#include "SDL_filesystem.h"
#include "SDL_gamecontroller.h"
#include "SDL_guid.h"
#include "SDL_haptic.h"
#include "SDL_hidapi.h"
#include "SDL_hints.h"
#include "SDL_joystick.h"
#include "SDL_loadso.h"
#include "SDL_log.h"
#include "SDL_messagebox.h"
#include "SDL_metal.h"
#include "SDL_mutex.h"
#include "SDL_power.h"
#include "SDL_render.h"
#include "SDL_rwops.h"
#include "SDL_sensor.h"
#include "SDL_shape.h"
#include "SDL_system.h"
#include "SDL_thread.h"
#include "SDL_timer.h"
#include "SDL_version.h"
#include "SDL_video.h"
#include "SDL_locale.h"
#include "SDL_misc.h"
#include "begin_code.h"
/* Set up for C function definitions, even when using C++ */
#ifdef __cplusplus
extern "C" {
#endif
/* As of version 0.5, SDL is loaded dynamically into the application */
/**
* \name SDL_INIT_*
*
* These are the flags which may be passed to SDL_Init(). You should
* specify the subsystems which you will be using in your application.
*/
/* @{ */
#define SDL_INIT_TIMER 0x00000001u
#define SDL_INIT_AUDIO 0x00000010u
#define SDL_INIT_VIDEO 0x00000020u /**< SDL_INIT_VIDEO implies SDL_INIT_EVENTS */
#define SDL_INIT_JOYSTICK 0x00000200u /**< SDL_INIT_JOYSTICK implies SDL_INIT_EVENTS */
#define SDL_INIT_HAPTIC 0x00001000u
#define SDL_INIT_GAMECONTROLLER 0x00002000u /**< SDL_INIT_GAMECONTROLLER implies SDL_INIT_JOYSTICK */
#define SDL_INIT_EVENTS 0x00004000u
#define SDL_INIT_SENSOR 0x00008000u
#define SDL_INIT_NOPARACHUTE 0x00100000u /**< compatibility; this flag is ignored. */
#define SDL_INIT_EVERYTHING ( \
SDL_INIT_TIMER | SDL_INIT_AUDIO | SDL_INIT_VIDEO | SDL_INIT_EVENTS | \
SDL_INIT_JOYSTICK | SDL_INIT_HAPTIC | SDL_INIT_GAMECONTROLLER | SDL_INIT_SENSOR \
)
/* @} */
/**
* Initialize the SDL library.
*
* SDL_Init() simply forwards to calling SDL_InitSubSystem(). Therefore, the
* two may be used interchangeably. Though for readability of your code
* SDL_InitSubSystem() might be preferred.
*
* The file I/O (for example: SDL_RWFromFile) and threading (SDL_CreateThread)
* subsystems are initialized by default. Message boxes
* (SDL_ShowSimpleMessageBox) also attempt to work without initializing the
* video subsystem, in hopes of being useful in showing an error dialog when
* SDL_Init fails. You must specifically initialize other subsystems if you
* use them in your application.
*
* Logging (such as SDL_Log) works without initialization, too.
*
* `flags` may be any of the following OR'd together:
*
* - `SDL_INIT_TIMER`: timer subsystem
* - `SDL_INIT_AUDIO`: audio subsystem
* - `SDL_INIT_VIDEO`: video subsystem; automatically initializes the events
* subsystem
* - `SDL_INIT_JOYSTICK`: joystick subsystem; automatically initializes the
* events subsystem
* - `SDL_INIT_HAPTIC`: haptic (force feedback) subsystem
* - `SDL_INIT_GAMECONTROLLER`: controller subsystem; automatically
* initializes the joystick subsystem
* - `SDL_INIT_EVENTS`: events subsystem
* - `SDL_INIT_EVERYTHING`: all of the above subsystems
* - `SDL_INIT_NOPARACHUTE`: compatibility; this flag is ignored
*
* Subsystem initialization is ref-counted, you must call SDL_QuitSubSystem()
* for each SDL_InitSubSystem() to correctly shutdown a subsystem manually (or
* call SDL_Quit() to force shutdown). If a subsystem is already loaded then
* this call will increase the ref-count and return.
*
* \param flags subsystem initialization flags
* \returns 0 on success or a negative error code on failure; call
* SDL_GetError() for more information.
*
* \since This function is available since SDL 2.0.0.
*
* \sa SDL_InitSubSystem
* \sa SDL_Quit
* \sa SDL_SetMainReady
* \sa SDL_WasInit
*/
extern DECLSPEC int SDLCALL SDL_Init(Uint32 flags);
/**
* Compatibility function to initialize the SDL library.
*
* In SDL2, this function and SDL_Init() are interchangeable.
*
* \param flags any of the flags used by SDL_Init(); see SDL_Init for details.
* \returns 0 on success or a negative error code on failure; call
* SDL_GetError() for more information.
*
* \since This function is available since SDL 2.0.0.
*
* \sa SDL_Init
* \sa SDL_Quit
* \sa SDL_QuitSubSystem
*/
extern DECLSPEC int SDLCALL SDL_InitSubSystem(Uint32 flags);
/**
* Shut down specific SDL subsystems.
*
* If you start a subsystem using a call to that subsystem's init function
* (for example SDL_VideoInit()) instead of SDL_Init() or SDL_InitSubSystem(),
* SDL_QuitSubSystem() and SDL_WasInit() will not work. You will need to use
* that subsystem's quit function (SDL_VideoQuit()) directly instead. But
* generally, you should not be using those functions directly anyhow; use
* SDL_Init() instead.
*
* You still need to call SDL_Quit() even if you close all open subsystems
* with SDL_QuitSubSystem().
*
* \param flags any of the flags used by SDL_Init(); see SDL_Init for details.
*
* \since This function is available since SDL 2.0.0.
*
* \sa SDL_InitSubSystem
* \sa SDL_Quit
*/
extern DECLSPEC void SDLCALL SDL_QuitSubSystem(Uint32 flags);
/**
* Get a mask of the specified subsystems which are currently initialized.
*
* \param flags any of the flags used by SDL_Init(); see SDL_Init for details.
* \returns a mask of all initialized subsystems if `flags` is 0, otherwise it
* returns the initialization status of the specified subsystems.
*
* The return value does not include SDL_INIT_NOPARACHUTE.
*
* \since This function is available since SDL 2.0.0.
*
* \sa SDL_Init
* \sa SDL_InitSubSystem
*/
extern DECLSPEC Uint32 SDLCALL SDL_WasInit(Uint32 flags);
/**
* Clean up all initialized subsystems.
*
* You should call this function even if you have already shutdown each
* initialized subsystem with SDL_QuitSubSystem(). It is safe to call this
* function even in the case of errors in initialization.
*
* If you start a subsystem using a call to that subsystem's init function
* (for example SDL_VideoInit()) instead of SDL_Init() or SDL_InitSubSystem(),
* then you must use that subsystem's quit function (SDL_VideoQuit()) to shut
* it down before calling SDL_Quit(). But generally, you should not be using
* those functions directly anyhow; use SDL_Init() instead.
*
* You can use this function with atexit() to ensure that it is run when your
* application is shutdown, but it is not wise to do this from a library or
* other dynamically loaded code.
*
* \since This function is available since SDL 2.0.0.
*
* \sa SDL_Init
* \sa SDL_QuitSubSystem
*/
extern DECLSPEC void SDLCALL SDL_Quit(void);
/* Ends C function definitions when using C++ */
#ifdef __cplusplus
}
#endif
#include "close_code.h"
#endif /* SDL_h_ */
/* vi: set ts=4 sw=4 expandtab: */

View File

@ -0,0 +1,98 @@
import AVFoundation
/// One piece of transcribed text together with the timestamps reported for it.
public struct Segment {
    /// Transcribed text; empty for a freshly opened placeholder segment.
    let text: String
    /// Start timestamp as delivered by the stream callback (-1 for a placeholder).
    let t0: Int64
    /// End timestamp as delivered by the stream callback (-1 for a placeholder).
    let t1: Int64
}
/// Segments in the order they were produced.
public typealias OrderedSegments = [Segment]

public extension OrderedSegments {
    /// The text of every segment concatenated in order, with no separator.
    var text: any StringProtocol {
        var combined = String()
        for segment in self {
            combined += segment.text
        }
        return combined
    }
}
/// A thread that runs the whisper streaming loop and publishes its transcript.
///
/// `segments` and `alive` are mutated from this thread (the stream callback is
/// invoked synchronously from `stream_run`), not from the main thread.
public class WhisperStream: Thread {
    /// Entered in `start()` and left at the end of `main()`; backs `join()`.
    let waiter = DispatchGroup()

    /// Transcript so far, limited to roughly the last `window` seconds.
    @Published public private(set) var segments = OrderedSegments()
    /// `true` until the streaming task has finished, for whatever reason.
    @Published public private(set) var alive = true

    /// Location of the whisper model file; its path is handed to `stream_init`.
    let model: URL
    /// Capture device to record from; `nil` keeps the default `capture_id`.
    let device: CaptureDevice?
    /// Length of transcript history to retain, in seconds.
    let window: TimeInterval

    /// - Parameters:
    ///   - model: URL of the whisper model file.
    ///   - device: Capture device to use, or `nil` for the default.
    ///   - window: Seconds of transcript history to keep in `segments`.
    public init(model: URL, device: CaptureDevice? = nil, window: TimeInterval = 300) {
        self.model = model
        self.device = device
        self.window = window
        super.init()
    }

    /// Starts the thread and arms `join()`.
    public override func start() {
        waiter.enter()
        super.start()
    }

    /// Thread entry point: runs the streaming task, then releases `join()`.
    public override func main() {
        task()
        waiter.leave()
    }

    /// Blocks the caller until `main()` has returned.
    public func join() {
        waiter.wait()
    }

    /// Initialises the native stream and pumps it until the thread is
    /// cancelled or `stream_run` reports a non-zero status.
    func task() {
        // The stream is dead on every exit path, including a failed
        // stream_init; previously that early return left `alive` stuck at true.
        defer { alive = false }

        model.path.withCString { modelCStr in
            var params = stream_default_params()
            params.model = modelCStr
            if let device = device {
                params.capture_id = device.id
            }

            guard let ctx = stream_init(params) else {
                return
            }
            defer { stream_free(ctx) }

            while !self.isCancelled {
                // `self` travels through the opaque context pointer because a C
                // function pointer cannot capture Swift state.
                let status = stream_run(ctx, Unmanaged.passUnretained(self).toOpaque()) { text, t0, t1, owner in
                    var transcript: String?
                    if let text = text {
                        transcript = String(cString: text)
                    }
                    return Unmanaged<WhisperStream>.fromOpaque(owner!)
                        .takeUnretainedValue()
                        .callback(text: transcript, t0: t0, t1: t1)
                }
                if status != 0 {
                    break
                }
            }
        }
    }

    /// Receives one update from the native stream.
    ///
    /// A `nil` text opens a new (empty) segment; a non-nil text replaces the
    /// most recent segment. Afterwards, segments whose `t0` lies more than
    /// `window` seconds before the newest segment's `t0` are dropped from the front.
    ///
    /// - Returns: Always 0.
    func callback(text: String?, t0: Int64, t1: Int64) -> Int32 {
        if segments.isEmpty || text == nil {
            segments.append(Segment(text: "", t0: -1, t1: -1))
        }
        if let text = text {
            segments[segments.count - 1] = Segment(text: text, t0: t0, t1: t1)
        }

        // Timestamps are compared against the window expressed in milliseconds.
        let horizon = Int64(window * 1000)
        var expired = 0
        if let newest = segments.last {
            expired = segments.filter { newest.t0 - $0.t0 > horizon }.count
        }
        segments.removeFirst(expired)
        return 0
    }
}

240
LibWhisper/stream.cpp Normal file
View File

@ -0,0 +1,240 @@
// This code is based on the streaming example provided with whisper.cpp:
// https://github.com/ggerganov/whisper.cpp/blob/ca21f7ab16694384fb74b1ba4f68b39f16540d23/examples/stream/stream.cpp
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "stream.h"
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <thread>
#include <vector>
// Owning pointer to a whisper_context that calls whisper_free() on destruction.
using unique_whisper = std::unique_ptr<whisper_context, std::integral_constant<decltype(&whisper_free), &whisper_free>>;
// All state carried between stream_init(), stream_run() and stream_free().
struct stream_context {
// Parameters as adjusted by stream_init().
stream_params params;
// Audio capture object created and resumed in stream_init().
std::unique_ptr<audio_async> audio;
// Whisper context loaded from params.model.
unique_whisper whisper;
// Samples handed to whisper_full() for the current iteration.
std::vector<float> pcmf32;
// Samples retained from the previous iteration (step mode only).
std::vector<float> pcmf32_old;
// Samples most recently fetched from the audio object.
std::vector<float> pcmf32_new;
// Tokens of the last full-length result, used as prompt when no_context is false.
std::vector<whisper_token> prompt_tokens;
// Time of the last VAD-triggered inference (VAD mode only).
std::chrono::time_point<std::chrono::high_resolution_clock> t_last;
// Time stream_init() completed; reference point for reported timestamps.
std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
// step_ms / length_ms / keep_ms converted to sample counts at WHISPER_SAMPLE_RATE.
int n_samples_step;
int n_samples_len;
int n_samples_keep;
// True when n_samples_step <= 0 (sliding-window mode with voice activity detection).
bool use_vad;
// Number of step-mode iterations between "new line" callbacks.
int n_new_line;
// Count of completed inference iterations.
int n_iter = 0;
};
// Returns the default streaming parameters.
//
// The `language` and `model` fields point at string literals, so the returned
// struct may be copied freely; callers typically override `model`.
struct stream_params stream_default_params() {
    // hardware_concurrency() is allowed to return 0 when the core count cannot
    // be determined; never hand whisper a thread count below 1.
    const int32_t hw_threads = (int32_t) std::thread::hardware_concurrency();
    const int32_t n_threads  = std::max(1, std::min(4, hw_threads));

    return stream_params {
        /* .n_threads     =*/ n_threads,
        /* .step_ms       =*/ 3000,
        /* .length_ms     =*/ 10000,
        /* .keep_ms       =*/ 200,
        /* .capture_id    =*/ -1,
        /* .max_tokens    =*/ 32,
        /* .audio_ctx     =*/ 0,
        /* .vad_thold     =*/ 0.6f,
        /* .freq_thold    =*/ 100.0f,
        /* .speed_up      =*/ false,
        /* .translate     =*/ false,
        /* .print_special =*/ false,
        /* .no_context    =*/ true,
        /* .no_timestamps =*/ false,
        /* .language      =*/ "en",
        /* .model         =*/ "models/ggml-base.en.bin"
    };
}
// Creates a streaming context: normalises the parameters, opens and resumes
// audio capture, validates the language and loads the whisper model.
//
// Returns a heap-allocated context owned by the caller, or a null pointer when
// audio initialisation, language lookup or model loading fails.
stream_context *stream_init(stream_params params) {
    auto state = std::make_unique<stream_context>();

    // keep <= step <= length
    params.keep_ms   = std::min(params.keep_ms, params.step_ms);
    params.length_ms = std::max(params.length_ms, params.step_ms);

    // Millisecond settings expressed as sample counts.
    state->n_samples_step = (1e-3 * params.step_ms) * WHISPER_SAMPLE_RATE;
    state->n_samples_len  = (1e-3 * params.length_ms) * WHISPER_SAMPLE_RATE;
    state->n_samples_keep = (1e-3 * params.keep_ms) * WHISPER_SAMPLE_RATE;
    const int n_samples_30s = (1e-3 * 30000.0) * WHISPER_SAMPLE_RATE;

    // A non-positive step selects sliding-window mode, which relies on VAD.
    state->use_vad = state->n_samples_step <= 0;

    // Number of steps after which a new line is started.
    if (state->use_vad) {
        state->n_new_line = 1;
    } else {
        state->n_new_line = std::max(1, params.length_ms / params.step_ms - 1);
    }

    params.no_timestamps = !state->use_vad;
    params.no_context    = params.no_context || state->use_vad;
    params.max_tokens    = 0;

    // Audio capture.
    state->audio = std::make_unique<audio_async>(params.length_ms);
    if (!state->audio->init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return nullptr;
    }
    state->audio->resume();

    // Whisper model.
    if (whisper_lang_id(params.language) == -1) {
        fprintf(stderr, "%s: unknown language '%s'\n", __func__, params.language);
        return nullptr;
    }
    state->whisper.reset(whisper_init_from_file(params.model));
    if (!state->whisper) {
        return nullptr;
    }

    state->pcmf32.assign(n_samples_30s, 0.0f);
    state->pcmf32_new.assign(n_samples_30s, 0.0f);

    state->t_last  = std::chrono::high_resolution_clock::now();
    state->t_start = state->t_last;
    state->params  = params;

    return state.release();
}
// Destroys a context returned by stream_init(). Passing a null pointer is a no-op.
//
// The context itself is heap-allocated (stream_init releases it from a
// unique_ptr), so it must be deleted here; merely clearing its members leaked
// the struct and the vectors' storage.
void stream_free(stream_context *ctx) {
    if (ctx == nullptr) {
        return;
    }
    // Shut down audio capture before the whisper context, as before.
    ctx->audio.reset();
    ctx->whisper.reset();
    delete ctx;
}
// Runs one iteration of the streaming loop.
//
// Step mode (use_vad == false): blocks until at least one step of new audio is
// available, prepends audio kept from the previous iteration and transcribes it.
// VAD mode: returns 0 without transcribing unless at least 2000 ms have passed
// since the last inference and vad_simple() accepts the most recent audio.
//
// `callback` is invoked once per resulting segment and, in step mode, once
// more with a NULL text every n_new_line iterations. Its return value is not
// inspected.
//
// Returns 0 on success (including "nothing to do yet") and 6 when
// whisper_full() fails.
int stream_run(stream_context *ctx, void *callback_ctx, stream_callback_t callback) {
    auto params = ctx->params;
    auto whisper = ctx->whisper.get();
    auto t_now = std::chrono::high_resolution_clock::now();

    if (!ctx->use_vad) {
        // Wait until a full step of audio has been captured.
        while (true) {
            ctx->audio->get(params.step_ms, ctx->pcmf32_new);
            if ((int)ctx->pcmf32_new.size() > 2 * ctx->n_samples_step) {
                fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
                ctx->audio->clear();
                continue;
            }
            if ((int)ctx->pcmf32_new.size() >= ctx->n_samples_step) {
                ctx->audio->clear();
                break;
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }

        const int n_samples_new = ctx->pcmf32_new.size();
        // take up to params.length_ms audio from previous iteration
        const int n_samples_take = std::min((int)ctx->pcmf32_old.size(), std::max(0, ctx->n_samples_keep + ctx->n_samples_len - n_samples_new));

        // pcmf32 = [tail of previous audio | new audio]
        ctx->pcmf32.resize(n_samples_new + n_samples_take);
        std::copy(ctx->pcmf32_old.end() - n_samples_take, ctx->pcmf32_old.end(), ctx->pcmf32.begin());
        std::copy(ctx->pcmf32_new.begin(), ctx->pcmf32_new.end(), ctx->pcmf32.begin() + n_samples_take);
        ctx->pcmf32_old = ctx->pcmf32;
    } else {
        auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - ctx->t_last).count();
        if (t_diff < 2000) {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            return 0;
        }

        // process new audio
        ctx->audio->get(2000, ctx->pcmf32_new);
        if (::vad_simple(ctx->pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
            ctx->audio->get(params.length_ms, ctx->pcmf32);
        } else {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            return 0;
        }
        ctx->t_last = t_now;
    }

    // run the inference
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.print_progress   = false;
    wparams.print_special    = params.print_special;
    wparams.print_realtime   = false;
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate        = params.translate;
    wparams.no_context       = true;
    wparams.single_segment   = !ctx->use_vad;
    wparams.max_tokens       = params.max_tokens;
    wparams.language         = params.language;
    wparams.n_threads        = params.n_threads;
    wparams.audio_ctx        = params.audio_ctx;
    wparams.speed_up         = params.speed_up;
    // disable temperature fallback
    wparams.temperature_inc  = -1.0f;
    wparams.prompt_tokens    = params.no_context ? nullptr : ctx->prompt_tokens.data();
    wparams.prompt_n_tokens  = params.no_context ? 0 : ctx->prompt_tokens.size();

    // Milliseconds since the stream started. duration_cast is used rather than
    // dividing the raw tick count, which is only correct when the clock ticks
    // in nanoseconds.
    const int64_t t1 = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - ctx->t_start).count();
    // Start of the analysed audio, clamped at the start of the stream.
    const int64_t t0 = static_cast<int64_t>(std::max(0.0, t1 - ctx->pcmf32.size() * 1000.0 / WHISPER_SAMPLE_RATE));

    if (whisper_full(whisper, wparams, ctx->pcmf32.data(), ctx->pcmf32.size()) != 0) {
        fprintf(stderr, "%s: failed to process audio\n", __func__);
        return 6;
    }

    // Report every segment; step mode uses the wall-clock window computed
    // above, VAD mode uses the timestamps whisper reports per segment.
    const int n_segments = whisper_full_n_segments(whisper);
    for (int i = 0; i < n_segments; ++i) {
        const char *text = whisper_full_get_segment_text(whisper, i);
        const int64_t segment_t0 = whisper_full_get_segment_t0(whisper, i);
        const int64_t segment_t1 = whisper_full_get_segment_t1(whisper, i);
        callback(text, ctx->use_vad ? segment_t0 : t0, ctx->use_vad ? segment_t1 : t1, callback_ctx);
    }

    ++ctx->n_iter;
    if (!ctx->use_vad && (ctx->n_iter % ctx->n_new_line) == 0) {
        // A NULL text tells the consumer to start a new line.
        callback(NULL, 0, 0, callback_ctx);

        // keep part of the audio for next iteration to try to mitigate word boundary issues
        ctx->pcmf32_old = std::vector<float>(ctx->pcmf32.end() - ctx->n_samples_keep, ctx->pcmf32.end());

        // Add tokens of the last full length segment as the prompt
        if (!params.no_context) {
            ctx->prompt_tokens.clear();
            for (int i = 0; i < n_segments; ++i) {
                const int token_count = whisper_full_n_tokens(whisper, i);
                for (int j = 0; j < token_count; ++j) {
                    ctx->prompt_tokens.push_back(whisper_full_get_token_id(whisper, i, j));
                }
            }
        }
    }
    return 0;
}

42
LibWhisper/stream.h Normal file
View File

@ -0,0 +1,42 @@
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
// Configuration for a whisper audio stream. Obtain defaults from
// stream_default_params() and override individual fields before stream_init().
// NOTE: stream.cpp initialises this struct positionally, so field order matters.
typedef struct stream_params {
// Thread count forwarded to whisper.
int32_t n_threads;
// Audio step in milliseconds; a non-positive value selects VAD mode.
int32_t step_ms;
// Audio window length in milliseconds; stream_init raises it to at least step_ms.
int32_t length_ms;
// Audio kept across line breaks, in milliseconds; stream_init caps it at step_ms.
int32_t keep_ms;
// Capture device id passed to the audio layer (default -1).
int32_t capture_id;
// NOTE(review): stream_init currently overwrites this with 0.
int32_t max_tokens;
// Forwarded to whisper's audio_ctx parameter.
int32_t audio_ctx;
// Thresholds passed to vad_simple() in VAD mode.
float vad_thold;
float freq_thold;
// Forwarded to the whisper parameters of the same name.
bool speed_up;
bool translate;
bool print_special;
// When false (step mode only), tokens of the previous result are used as prompt;
// stream_init forces this to true in VAD mode.
bool no_context;
// NOTE(review): stream_init overwrites this (true in step mode, false in VAD mode).
bool no_timestamps;
// Language code; stream_init fails when whisper does not recognise it.
const char *language;
// Path of the model file to load.
const char *model;
} stream_params_t;
// Returns the default parameter set. Declared with (void) so that C and
// Objective-C callers get a real prototype rather than an unspecified
// argument list.
stream_params_t stream_default_params(void);
// Opaque handle to a running stream.
typedef struct stream_context *stream_context_t;
// Creates a stream from `params`. Returns NULL when audio initialisation,
// language lookup or model loading fails.
stream_context_t stream_init(stream_params_t params);
// Releases the audio capture and whisper context held by `ctx`.
void stream_free(stream_context_t ctx);
// Receives transcription updates. `text` is NULL when a new line should be
// started; `ctx` is the `callback_ctx` pointer given to stream_run().
typedef int (*stream_callback_t) (const char *text, int64_t t0, int64_t t1, void *ctx);
// Runs one iteration of the stream, invoking `callback` for each result.
// Returns 0 on success and a non-zero value when inference fails.
int stream_run(stream_context_t ctx, void *callback_ctx, stream_callback_t callback);
#ifdef __cplusplus
}
#endif