/* secure-streams/minimal-secure-streams-alexa/audio.c */

/*
 * alsa audio handling
 *
 * Written in 2010-2020 by Andy Green <andy@warmcat.com>
 *
 * This file is made available under the Creative Commons CC0 1.0
 * Universal Public Domain Dedication.
 */

#include <libwebsockets.h>
#include <string.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <alsa/asoundlib.h>
#include <pv_porcupine.h>

#include <mpg123.h>

#include "private.h"

/*
 * Secure Streams handles defined outside this file.  Only hss_avs_sync is
 * used here, to request a tx opportunity while an utterance is being captured.
 */
extern struct lws_ss_handle *hss_avs_event, *hss_avs_sync;

/*
 * Defined outside this file.  Called from the wakeword detection path; a zero
 * return is treated here as success and switches us into capture mode.
 */
int
avs_query_start(struct lws_context *context);

/* Operating modes held in struct raw_vhd .mode */
enum {
	/* feeding captured audio to the wakeword detector */
	MODE_IDLE,
	/* recording an utterance; spool_capture() hands the samples out */
	MODE_CAPTURING,
	/* entered when play_mp3() is given an mpg123 handle to play */
	MODE_PLAYING
};

/*
 * Per-vhost state for the audio protocol.  It is allocated zeroed at
 * PROTOCOL_INIT, so every member starts out as 0 / NULL.
 */
struct raw_vhd {
	/*
	 * Sample ring buffer filled from the capture channel.  Note the mp3
	 * playback path also decodes into the start of this same array.
	 */
	int16_t			p[8 * 1024]; /* 500ms at 16kHz 16-bit PCM */
	pv_porcupine_object_t	*porc;		/* wakeword detector instance */
	snd_pcm_t		*pcm_capture;	/* ALSA capture handle */
	snd_pcm_t		*pcm_playback;	/* ALSA playback handle */
	snd_pcm_hw_params_t	*params;	/* not referenced in this file's visible code */
	snd_pcm_uframes_t	frames;		/* not referenced in this file's visible code */
	int16_t			*porcbuf;	/* heap staging buffer for one porcupine frame */

	mpg123_handle		*mh;		/* mp3 decoder given to play_mp3(), NULL if none */

	mp3_done_cb		done_cb;	/* called with .opaque after the mp3 is destroyed */
	void			*opaque;	/* argument passed to done_cb */

	int			mode;		/* one of the MODE_ enum values */
	int			rate;		/* pv_sample_rate() result */

	int			porc_spf;	/* pv_porcupine_frame_length(): samples per frame */
	int			filefd;		/* not referenced in this file's visible code */
	int			rpos;		/* not referenced in this file's visible code */
	int			wpos;		/* index in p[] where the next capture data lands */
	int			porcpos;	/* index in p[] the wakeword detector reads next */
	int			npos;		/* index in p[] spool_capture() reads next */
	int			times;		/* not referenced in this file's visible code */
	int			quietcount;	/* samples of consecutive "quiet" while capturing */
	int			anycount;	/* read by the silence test, never written in the visible code */

	int			wplay;		/* not referenced in this file's visible code */
	int			rplay;		/* not referenced in this file's visible code */

	char			last_wake_detect; /* detector result for the previous frame */
	char			destroy_mh_on_drain; /* set by play_mp3(NULL, ...): destroy mh once decode runs dry */
};

/*
 * The single audio state instance: set at PROTOCOL_INIT on the first vhost
 * that initializes the protocol, and used by spool_capture() / play_mp3(),
 * which have no vhost of their own to look it up from.
 */
static struct raw_vhd *avhd;

/*
 * Upload-side reader of the capture ring buffer, called from alexa.c.
 *
 * Copies up to len / 2 captured 16-bit samples into buf, starting at the
 * ring's npos and stopping early if it catches up with the capture write
 * position.
 *
 * Returns the number of bytes placed in buf (may be 0), or -1 if we are not
 * currently in capture mode.
 */

int
spool_capture(uint8_t *buf, size_t len)
{
	int16_t *out = (int16_t *)buf;
	size_t wanted = len / 2, copied;

	if (avhd->mode != MODE_CAPTURING)
		return -1;

	/* drain whole samples until the caller's buffer or the ring is done */
	for (copied = 0; copied < wanted && avhd->wpos != avhd->npos;
	     copied++) {
		out[copied] = avhd->p[avhd->npos];
		avhd->npos = (avhd->npos + 1)  % LWS_ARRAY_SIZE(avhd->p);
	}

	lwsl_info("Copied %d samples (%d %d)\n", (int)copied,
			avhd->wpos, avhd->npos);

	return copied * 2;
}

/*
 * Playback control entry point, called from alexa.c.
 *
 * With a non-NULL mh: adopt it as the current decoder, switch to playing mode
 * and prepare the playback channel (cb and opaque are not used in this case).
 *
 * With a NULL mh: note that the current decoder should be destroyed once it
 * has nothing more to give, and remember cb / opaque so the caller can be
 * told when that has happened.
 *
 * Always returns 0.
 */

int
play_mp3(mpg123_handle *mh, mp3_done_cb cb, void *opaque)
{
	if (!mh) {
		/* end of input: arrange teardown + notification on drain */
		avhd->destroy_mh_on_drain = 1;
		avhd->done_cb = cb;
		avhd->opaque = opaque;

		return 0;
	}

	/* start of a new mp3 */
	avhd->mh = mh;
	avhd->mode = MODE_PLAYING;
	snd_pcm_prepare(avhd->pcm_playback);

	return 0;
}

/*
 * Helper used to set alsa hwparams on both capture and playback channels
 *
 * Opens the ALSA "default" device nonblocking for the requested direction,
 * gets lws to adopt its poll descriptor as a RAW file wsi on the
 * "lws-audio-test" protocol, and configures the channel as interleaved,
 * S16_LE, mono, at (as near as ALSA allows to) the porcupine sample rate.
 *
 * vh:   vhost that adopts the pcm poll descriptor
 * pcm:  receives the opened pcm handle
 * type: SND_PCM_STREAM_PLAYBACK or SND_PCM_STREAM_CAPTURE
 *
 * Returns 0 on success, -1 on failure.  On failure after the open succeeded,
 * *pcm is left holding the open handle; it is not closed here.
 */

static int
set_hw_params(struct lws_vhost *vh, snd_pcm_t **pcm, int type)
{
	unsigned int rate = pv_sample_rate(); /* it's 16kHz */
	snd_pcm_hw_params_t *params = NULL;
	lws_sock_file_fd_type u;
	struct pollfd pfd;
	struct lws *wsi1;
	int n;

	n = snd_pcm_open(pcm, "default", type, SND_PCM_NONBLOCK);
	if (n < 0) {
		lwsl_err("%s: Can't open default for playback: %s\n",
			 __func__, snd_strerror(n));

		return -1;
	}

	if (snd_pcm_poll_descriptors(*pcm, &pfd, 1) != 1) {
		lwsl_err("%s: failed to get playback desc\n", __func__);
		return -1;
	}

	u.filefd = (lws_filefd_type)(long long)pfd.fd;
	wsi1 = lws_adopt_descriptor_vhost(vh, LWS_ADOPT_RAW_FILE_DESC, u,
					  "lws-audio-test", NULL);
	if (!wsi1) {
		/*
		 * This is not an ALSA failure, n still holds the successful
		 * snd_pcm_open() result, so don't go via the bail path that
		 * would report it as a hw params error.
		 */
		lwsl_err("%s: Failed to adopt playback desc\n", __func__);
		return -1;
	}
	if (type == SND_PCM_STREAM_PLAYBACK)
		lws_rx_flow_control(wsi1, 0); /* no POLLIN */

	/* without this check a failed alloc leaves params uninitialized */
	n = snd_pcm_hw_params_malloc(&params);
	if (n < 0)
		goto bail;

	n = snd_pcm_hw_params_any(*pcm, params);
	if (n < 0)
		goto bail1;

	n = snd_pcm_hw_params_set_access(*pcm, params,
					 SND_PCM_ACCESS_RW_INTERLEAVED);
	if (n < 0)
		goto bail1;

	n = snd_pcm_hw_params_set_format(*pcm, params, SND_PCM_FORMAT_S16_LE);
	if (n < 0)
		goto bail1;

	n = snd_pcm_hw_params_set_channels(*pcm, params, 1);
	if (n < 0)
		goto bail1;

	/* rate is updated with what ALSA actually selected */
	n = snd_pcm_hw_params_set_rate_near(*pcm, params, &rate, 0);
	if (n < 0)
		goto bail1;

	lwsl_notice("%s: %s rate %u\n", __func__,
		type == SND_PCM_STREAM_PLAYBACK ? "Playback" : "Capture", rate);

	/* apply the accumulated configuration to the device */
	n = snd_pcm_hw_params(*pcm, params);
	snd_pcm_hw_params_free(params);
	if (n < 0)
		goto bail;

	return 0;

bail1:
	snd_pcm_hw_params_free(params);
bail:
	lwsl_err("%s: Set hw params failed: %s\n", __func__, snd_strerror(n));

	return -1;
}

/*
 * The lws RAW file protocol handler that wraps ALSA.
 *
 * The timing is coming from ALSA capture channel... since they are both set to
 * 16kHz, it's enough just to have the one.
 *
 * PROTOCOL_INIT:    on the first vhost only, allocate the per-vhost state,
 *                   bring up the wakeword library and open + configure the
 *                   playback and capture channels.  Returns -1 on any failure.
 * PROTOCOL_DESTROY: release the porcupine frame buffer, both pcm handles and
 *                   the wakeword instance.
 * RAW_RX_FILE:      service mp3 playback if active, read the next chunk of
 *                   capture samples into the ring buffer, then either run the
 *                   end-of-utterance detection (capturing) or the wakeword
 *                   detector (idle).
 *
 * Returns 0 except for the PROTOCOL_INIT failures noted above.
 */

static int
callback_audio(struct lws *wsi, enum lws_callback_reasons reason, void *user,
	       void *in, size_t len)
{
	struct raw_vhd *vhd = (struct raw_vhd *)lws_protocol_vh_priv_get(
				   lws_get_vhost(wsi), lws_get_protocol(wsi));
	uint16_t rands[50];
	int16_t temp[256];
	/* initialized so a failed pv_porcupine_process() can't leave it unset */
	bool det = false;
	long avg;
	int n, s;

	switch (reason) {
	case LWS_CALLBACK_PROTOCOL_INIT:

		if (avhd) /* just on one vhost */
			return 0;

		vhd = lws_protocol_vh_priv_zalloc(lws_get_vhost(wsi),
				lws_get_protocol(wsi), sizeof(struct raw_vhd));
		if (!vhd) {
			lwsl_err("%s: OOM\n", __func__);

			return -1;
		}
		avhd = vhd;

		/*
		 * Set up the wakeword library
		 */

		n = pv_porcupine_init("porcupine_params.pv", "alexa_linux.ppn",
					1.0, &vhd->porc);
		if (n) {
			lwsl_err("%s: porcupine init fail %d\n", __func__, n);

			return -1;
		}
		vhd->porc_spf = pv_porcupine_frame_length();
		/* room for one porcupine frame of 16-bit samples */
		vhd->porcbuf = malloc(vhd->porc_spf * sizeof(*vhd->porcbuf));
		if (!vhd->porcbuf) {
			lwsl_err("%s: OOM\n", __func__);

			return -1;
		}
		lwsl_info("%s: %s porc frame length is %d samples\n", __func__,
				lws_get_vhost_name(lws_get_vhost(wsi)),
				vhd->porc_spf);

		vhd->rate = pv_sample_rate(); /* 16kHz */

		/* set up alsa */

		if (set_hw_params(lws_get_vhost(wsi), &vhd->pcm_playback,
				  SND_PCM_STREAM_PLAYBACK))  {
			lwsl_err("%s: Can't open default for playback\n",
				 __func__);

			return -1;
		}

		if (set_hw_params(lws_get_vhost(wsi), &vhd->pcm_capture,
				  SND_PCM_STREAM_CAPTURE))  {
			lwsl_err("%s: Can't open default for capture\n",
				 __func__);

			return -1;
		}

		snd_config_update_free_global();

		break;

	case LWS_CALLBACK_PROTOCOL_DESTROY:
		lwsl_info("%s: LWS_CALLBACK_PROTOCOL_DESTROY\n", __func__);
		if (!vhd)
			break;

		if (vhd->porcbuf) {
			free(vhd->porcbuf);
			vhd->porcbuf = NULL;
		}
		if (vhd->pcm_playback) {
			snd_pcm_drop(vhd->pcm_playback);
			snd_pcm_close(vhd->pcm_playback);
			vhd->pcm_playback = NULL;
		}
		if (vhd->pcm_capture) {
			snd_pcm_drop(vhd->pcm_capture);
			snd_pcm_close(vhd->pcm_capture);
			vhd->pcm_capture = NULL;
		}
		if (vhd->porc) {
			pv_porcupine_delete(vhd->porc);
			vhd->porc = NULL;
		}

		/* avoid most of the valgrind mess from alsa */
		snd_config_update_free_global();

		break;

	case LWS_CALLBACK_RAW_CLOSE_FILE:
		lwsl_info("%s: closed\n", __func__);
		break;

	case LWS_CALLBACK_RAW_RX_FILE:
		/* we come here about every 250ms */

		/*
		 * Playing back the mp3?
		 */
		if (vhd->mode == MODE_PLAYING && vhd->mh) {
			size_t amt = 0, try;

			do {
				/* how many samples the playback side will take */
				try = snd_pcm_avail(vhd->pcm_playback);
				if (try > LWS_ARRAY_SIZE(vhd->p))
					try = LWS_ARRAY_SIZE(vhd->p);

				/* decode straight into the sample buffer */
				n = mpg123_read(vhd->mh, (uint8_t *)vhd->p,
						try * 2, &amt);
				lwsl_info("%s: PLAYING: mpg123 read %d, n %d\n",
						__func__, (int)amt, n);
				if (n == MPG123_NEW_FORMAT) {
					/* push some zeros and re-prepare, then retry the read */
					snd_pcm_start(vhd->pcm_playback);
					memset(vhd->p, 0, try);
					snd_pcm_writei(vhd->pcm_playback,
						       vhd->p, try / 2);
					snd_pcm_prepare(vhd->pcm_playback);
				}
			} while (n == MPG123_NEW_FORMAT);

			if (amt) {
				/* amt is in bytes, alsa wants mono 16-bit frames */
				n = snd_pcm_writei(vhd->pcm_playback,
						   vhd->p, amt / 2);
				if (n < 0)
					lwsl_notice("%s: snd_pcm_writei: %d %s\n",
						    __func__, n, snd_strerror(n));
				if (n == -EPIPE) {
					lwsl_err("%s: did EPIPE prep\n", __func__);
					snd_pcm_prepare(vhd->pcm_playback);
				}
			} else
				/*
				 * Nothing decoded: if play_mp3(NULL, ...) told
				 * us the input is finished, tear the decoder
				 * down and let the owner know.
				 */
				if (vhd->destroy_mh_on_drain &&
				    n != MPG123_NEW_FORMAT) {
					snd_pcm_drain(vhd->pcm_playback);
					vhd->destroy_mh_on_drain = 0;
					lwsl_notice("%s: mp3 destroyed\n",
							__func__);
					mpg123_close(vhd->mh);
					mpg123_delete(vhd->mh);
					vhd->mh = NULL;
					vhd->mode = MODE_IDLE;

					if (vhd->done_cb)
						vhd->done_cb(vhd->opaque);
				}
		}

		/*
		 * Get the capture data
		 */

		n = snd_pcm_readi(vhd->pcm_capture, temp, LWS_ARRAY_SIZE(temp));
		if (n <= 0) {
			/*
			 * An error (negative) or no samples.  There is nothing
			 * new to process: using n below would divide by zero
			 * in the energy estimate when it is 0, and would move
			 * wpos backwards and corrupt quietcount when it is
			 * negative.
			 *
			 * NOTE(review): no xrun recovery is attempted on the
			 * capture channel here.
			 */
			if (n < 0 && n != -EAGAIN)
				lwsl_info("%s: snd_pcm_readi: %d %s\n",
					  __func__, n, snd_strerror(n));
			break;
		}

		/* append to the ring; wpos itself only moves at eol below */
		s = 0;
		while (s < n) {
			vhd->p[(vhd->wpos + s) % LWS_ARRAY_SIZE(vhd->p)] = temp[s];
			s++;
		}

		if (vhd->mode == MODE_CAPTURING) {

			/*
			 * We are recording an utterance.
			 *
			 * Estimate the sound density in the frame by picking 50
			 * samples at random and averaging the sampled
			 * [abs()^2] / 10000 to create a Figure of Merit.
			 *
			 * Speaking on my laptop gets us 1000 - 5000, silence
			 * is typ under 30.  The wakeword tells us there was
			 * speech at the start, end the capture when there's
			 * ~750ms (12000 samples) under 125 FOM.
			 */

#define SILENCE_THRESH 125

			avg = 0;
			lws_get_random(lws_get_context(wsi), rands, sizeof(rands));
			for (s = 0; s < (int)LWS_ARRAY_SIZE(rands); s++) {
				long q;

				/* n > 0 is guaranteed by the check after the read */
				q = temp[rands[s] % n];

				avg += (q * q);
			}
			avg = (avg / (int)LWS_ARRAY_SIZE(rands)) / 10000;

			lwsl_notice("est audio energy: %ld %d\n", avg, vhd->mode);

			/*
			 * Only start looking for "silence" after 1.5s, in case
			 * he does a long pause after the wakeword
			 *
			 * NOTE(review): anycount is never written anywhere in
			 * this file, so it stays 0 and the first half of this
			 * test is always true, ie, the 1.5s holdoff is not
			 * actually in effect.  Left as-is; confirm the intent
			 * before changing it.
			 */

			if (vhd->anycount < (3 *vhd->rate) / 2 &&
			    avg < SILENCE_THRESH) {
				vhd->quietcount += n;
				/* then 750ms of "silence" does it for us */
				if (vhd->quietcount >= ((vhd->rate * 3) / 4)) {
					lwsl_warn("%s: ended capture\n", __func__);
					vhd->mode = MODE_IDLE;
					vhd->quietcount = 0;
				}
			}

			/* if we're not "silent", reset the count */
			if (avg > SILENCE_THRESH * 2)
				vhd->quietcount = 0;

			/*
			 * Since we are in capturing mode, we have something
			 * new to send now.
			 *
			 * We must send an extra one at the end so we can finish
			 * the tx.
			 */
			lws_ss_request_tx(hss_avs_sync);
		}

		/*
		 * Just waiting for a wakeword
		 */

		while (vhd->mode == MODE_IDLE) {
			/* ppold: where this frame started, in case it's the one */
			int m = 0, ppold = vhd->porcpos;

			/* samples in the ring the detector has not seen yet */
			s = (vhd->wpos - vhd->porcpos) % LWS_ARRAY_SIZE(vhd->p);
			if (s < vhd->porc_spf)
				goto eol;

			/* linearize one detector frame out of the ring */
			while (m < vhd->porc_spf) {
				vhd->porcbuf[m++] = avhd->p[vhd->porcpos];
				vhd->porcpos = (vhd->porcpos + 1) %
							LWS_ARRAY_SIZE(vhd->p);
			}

			if (pv_porcupine_process(vhd->porc, vhd->porcbuf, &det))
				lwsl_err("%s: porc_process failed\n", __func__);

			/*
			 * Act on the first frame without a detection that
			 * follows a frame with one
			 */
			if (!det && vhd->last_wake_detect &&
			    vhd->mode == MODE_IDLE) {
				lwsl_warn("************* Wakeword\n");
				if (!avs_query_start(lws_get_context(wsi))) {
					vhd->mode = MODE_CAPTURING;
					vhd->quietcount = 0;
					vhd->last_wake_detect = det;
					/* upload starts from this frame */
					vhd->npos = ppold;
					break;
				}
			}
			vhd->last_wake_detect = det;
		}

eol:
		/* commit the n samples appended above */
		vhd->wpos = (vhd->wpos + n) % LWS_ARRAY_SIZE(vhd->p);
		break;

	default:
		break;
	}

	return 0;
}

/*
 * Protocol definition binding the name "lws-audio-test" to callback_audio().
 * set_hw_params() adopts the ALSA poll descriptors using this same protocol
 * name.  The two trailing members are given as 0.
 */
struct lws_protocols protocol_audio_test =
	{ "lws-audio-test", callback_audio, 0, 0 };