// go-two-tone-detector-wav/main.go

package main

import (
	"flag"
	"fmt"
	"log/slog"
	"math"
	"os"
	"time"

	"github.com/go-audio/audio"
	"github.com/go-audio/wav"
)

// Command-line flags.
//
// All *Ms values are durations in milliseconds. main converts -win and -hop
// into sample counts for the fixed 8000 Hz sample rate and hands the remaining
// values unchanged to newTwoToneDetector.
//
//	-wav      path to the input WAV file; required, and main rejects files
//	          that are not mono 8 kHz
//	-minA     minimum accumulated duration of Tone A
//	-minB     minimum accumulated duration of Tone B
//	-gap      gap budget between Tone A and Tone B
//	-win      analysis window length
//	-hop      distance between the starts of consecutive windows; main
//	          requires 0 < hop <= win after conversion to samples
//	-ratio    minimum best-bin power ratio for a window to count as a tone
//	-rms      minimum RMS level (in 16-bit sample units) for a window to be
//	          analysed at all
//	-verbose  lowers the log level from Info to Debug
var (
	wavFile     = flag.String("wav", "", "Path to mono 8kHz WAV file")
	minAms      = flag.Int("minA", 1000, "Minimum Tone A duration (ms)")
	minBms      = flag.Int("minB", 3000, "Minimum Tone B duration (ms)")
	gapMaxMs    = flag.Int("gap", 5000, "Max gap between A and B (ms)")
	winMs       = flag.Int("win", 100, "Window size (ms)")
	hopMs       = flag.Int("hop", 50, "Hop size (ms)")
	ratioThresh = flag.Float64("ratio", 0.65, "Power ratio threshold for tone detection")
	rmsThresh   = flag.Float64("rms", 300.0, "Minimum RMS for valid signal")
	verbose     = flag.Bool("verbose", false, "Enable debug logging")
)

// goertzel is a single-bin Goertzel filter: it measures the power of one DFT
// bin of an N-sample block without computing the whole transform.
type goertzel struct {
	N     int     // block length in samples
	fs    float64 // sample rate in Hz
	k     int     // index of the DFT bin nearest to the target frequency
	coeff float64 // recurrence coefficient, 2*cos(2*pi*k/N)
}

// newGoertzel returns a Goertzel filter tuned to the DFT bin closest to
// targetHz.
//
// targetHz is the frequency to detect in Hz, fs the sample rate in Hz and N
// the block length in samples. The target is rounded to the nearest integer
// bin k = round(N*targetHz/fs), so the frequency actually measured is k*fs/N;
// it equals targetHz only when targetHz is a multiple of fs/N. Both fs and N
// must be positive, otherwise the coefficient is not a finite number.
func newGoertzel(targetHz float64, fs float64, N int) *goertzel {
	g := &goertzel{N: N, fs: fs}
	// Adding 0.5 before truncation rounds to the nearest bin.
	g.k = int(0.5 + (float64(g.N)*targetHz)/fs)
	omega := (2.0 * math.Pi * float64(g.k)) / float64(g.N)
	g.coeff = 2.0 * math.Cos(omega)
	return g
}

// Power returns the squared magnitude of DFT bin g.k for the first g.N samples
// of x.
//
// If x holds fewer than g.N samples the missing ones are treated as zeros
// (zero input leaves the Goertzel power unchanged, so the recurrence simply
// stops early) instead of indexing past the end of the slice. Samples beyond
// g.N are ignored. An empty block yields 0.
func (g *goertzel) Power(x []float64) float64 {
	n := g.N
	if len(x) < n {
		n = len(x)
	}

	// s1 and s2 are the two most recent states of the resonator
	// s[i] = x[i] + coeff*s[i-1] - s[i-2].
	var s1, s2 float64
	for i := 0; i < n; i++ {
		s1, s2 = x[i]+g.coeff*s1-s2, s1
	}

	// |s1 - e^{-jw}*s2|^2 expands to s1^2 + s2^2 - 2*cos(w)*s1*s2, and
	// 2*cos(w) is exactly coeff, so no trigonometry is needed per call.
	p := s1*s1 + s2*s2 - g.coeff*s1*s2
	if p < 0 {
		// Only reachable through rounding when the true power is ~0.
		return 0
	}
	return p
}

// windowHann multiplies x in place by a symmetric Hann window,
// w[i] = 0.5 * (1 - cos(2*pi*i/(len(x)-1))), which tapers both ends of the
// block to zero and so reduces spectral leakage.
//
// Slices with fewer than two samples are left untouched: the formula divides
// by len(x)-1, which would turn a single sample into NaN, and the conventional
// one-point Hann window is simply [1].
func windowHann(x []float64) {
	if len(x) < 2 {
		return
	}
	last := float64(len(x) - 1)
	for i := range x {
		x[i] *= 0.5 * (1.0 - math.Cos(2.0*math.Pi*float64(i)/last))
	}
}

// pcmToFloat widens 16-bit PCM samples to float64.
// The result always has exactly N elements: the first min(N, len(buf)) hold
// the corresponding samples of buf, and any remaining elements stay zero, so a
// short buffer is effectively zero-padded and a long one is truncated.
func pcmToFloat(buf []int16, N int) []float64 {
	samples := make([]float64, N)

	// Copy no more than what both the source and the destination can hold.
	limit := len(buf)
	if N < limit {
		limit = N
	}
	for idx, s := range buf[:limit] {
		samples[idx] = float64(s)
	}
	return samples
}

// rmsPCM returns the root mean square of the 16-bit PCM samples in buf, i.e.
// the square root of the mean of the squared sample values, expressed in the
// same units as the samples themselves.
// An empty or nil slice yields 0.
func rmsPCM(buf []int16) float64 {
	count := len(buf)
	if count == 0 {
		return 0
	}

	sumSquares := 0.0
	for i := 0; i < count; i++ {
		sample := float64(buf[i])
		sumSquares += sample * sample
	}
	return math.Sqrt(sumSquares / float64(count))
}

// twoToneDetector holds the configuration and the running state of the
// A-then-B tone sequence detector. The first group of fields is fixed by
// newTwoToneDetector; the fields from inA onwards are the state that
// stepWindow advances one window at a time and that reset clears.
type twoToneDetector struct {
	fs          int          // sample rate in Hz
	winN        int          // analysis window length in samples
	hopN        int          // hop between windows in samples; each window is credited this much time
	ratioThresh float64      // minimum best-bin power ratio for a window to count as a tone
	rmsThresh   float64      // minimum RMS level for a window to be analysed
	minAms      int          // accumulated ms required to accept Tone A
	minBms      int          // accumulated ms required to accept Tone B
	gapMaxMs    int          // gap budget in ms; initial value of gapRemainMs
	freqs       []float64    // candidate frequencies in Hz (300..3000 in 10 Hz steps)
	gzBank      []*goertzel  // one filter per entry of freqs, same index
	inA         bool         // true while a Tone A candidate is being confirmed
	aFreq       float64      // frequency of the Tone A candidate in Hz
	aAccumMs    int          // ms credited to Tone A so far
	aStart      time.Time    // start time of the first Tone A window
	aEnd        time.Time    // end time of the latest window credited to Tone A
	waitingB    bool         // true once Tone A reached minAms and Tone B is being sought
	bFreq       float64      // frequency of the Tone B candidate in Hz
	bAccumMs    int          // ms credited to the Tone B candidate so far
	bStart      time.Time    // start time of the first window of the Tone B candidate
	bEnd        time.Time    // end time of the latest window credited to Tone B
	gapRemainMs int          // remaining gap budget in ms; set to gapMaxMs when Tone A is accepted
	logger      *slog.Logger // destination for diagnostic output
}

// newTwoToneDetector builds a detector in its idle state together with the
// Goertzel filter bank it scans on every window. The bank holds one filter per
// candidate frequency from 300 Hz to 3000 Hz inclusive, spaced 10 Hz apart,
// each configured for sample rate fs and block length winN.
//
// Parameters:
//
//	fs          - sample rate in Hz
//	winN        - analysis window length in samples
//	hopN        - hop between consecutive windows in samples
//	ratioThresh - minimum best-bin power ratio for a window to count as a tone
//	rmsThresh   - minimum RMS level for a window to be analysed
//	minAms      - minimum duration of Tone A in milliseconds
//	minBms      - minimum duration of Tone B in milliseconds
//	gapMaxMs    - gap budget between the tones in milliseconds
//	logger      - logger used for diagnostic output
//
// The returned detector stores the arguments as given; no validation is done
// here.
func newTwoToneDetector(fs, winN, hopN int, ratioThresh, rmsThresh float64, minAms, minBms, gapMaxMs int, logger *slog.Logger) *twoToneDetector {
	const (
		lowHz  = 300
		highHz = 3000
		stepHz = 10
	)

	// Step in whole Hz so the number of candidates is known up front.
	count := (highHz-lowHz)/stepHz + 1
	candidates := make([]float64, 0, count)
	bank := make([]*goertzel, 0, count)
	for hz := lowHz; hz <= highHz; hz += stepHz {
		f := float64(hz)
		candidates = append(candidates, f)
		bank = append(bank, newGoertzel(f, float64(fs), winN))
	}

	det := &twoToneDetector{
		fs:          fs,
		winN:        winN,
		hopN:        hopN,
		ratioThresh: ratioThresh,
		rmsThresh:   rmsThresh,
		minAms:      minAms,
		minBms:      minBms,
		gapMaxMs:    gapMaxMs,
		logger:      logger,
	}
	det.freqs = candidates
	det.gzBank = bank
	return det
}

// toneMatchHz is the largest difference, in Hz, between two per-window peak
// frequencies that is still treated as the same tone.
const toneMatchHz = 10.0

// stepWindow feeds one analysis window into the A-then-B tone state machine.
//
// A window counts as a tone when its RMS level is at least d.rmsThresh and the
// strongest Goertzel bin reaches at least d.ratioThresh of the power a pure,
// bin-centred tone of the same energy would produce. The ratio is therefore
// close to 1 for a clean tone and close to 0 for broadband noise, whatever the
// window length. Every window is credited with one hop of duration.
//
// State handling:
//   - Idle or confirming Tone A: consecutive tone windows within toneMatchHz
//     of each other accumulate Tone A (the first window included). A tone at a
//     different frequency starts a new Tone A candidate; a non-tone window
//     resets the detector. Once d.minAms is reached the detector waits for B.
//   - Waiting for Tone B: windows near Tone A's frequency mean A is still
//     sounding, so they extend A and restore the full gap budget. Windows at
//     another frequency accumulate a Tone B candidate. Time that belongs to
//     neither tone (non-tone windows and abandoned B candidates) is charged
//     against d.gapMaxMs; when that budget is overdrawn the detector resets.
//     Tone B's own windows never consume the gap budget.
//
// The detector is not reset after a detection; the caller does that.
//
// Parameters:
//
//	pcms - 16-bit PCM samples of the current window (zero-padded or truncated
//	       to d.winN for the spectral analysis).
//	t0   - start time of the current window.
//
// Returns:
//
//	event     - "TWO_TONE_DETECTED" when Tone B reaches d.minBms, otherwise "".
//	aFreq     - frequency of Tone A in Hz.
//	aDur      - duration of Tone A in milliseconds (first window start to last
//	            window end).
//	bFreq     - frequency of Tone B in Hz.
//	bDur      - duration of Tone B in milliseconds.
//	timestamp - start time of the window that completed the detection; the
//	            zero time.Time when no event is reported.
func (d *twoToneDetector) stepWindow(pcms []int16, t0 time.Time) (event string, aFreq, aDur, bFreq, bDur float64, timestamp time.Time) {
	hopMs := int(float64(d.hopN) * 1000.0 / float64(d.fs))
	hopDur := time.Duration(hopMs) * time.Millisecond
	now := t0

	// Level gate first: it is far cheaper than running the filter bank.
	r := rmsPCM(pcms)
	if r < d.rmsThresh {
		d.logger.Debug("RMS below threshold",
			"time", now.Format(time.RFC3339),
			"rms", fmt.Sprintf("%.2f", r),
			"threshold", d.rmsThresh)
		d.missWindow(hopMs, now)
		return "", 0, 0, 0, 0, time.Time{}
	}

	xi := pcmToFloat(pcms, d.winN)
	windowHann(xi)

	var total float64
	for _, v := range xi {
		total += v * v
	}

	// Find frequency with highest power
	bestIdx := -1
	bestPow := 0.0
	for i, gz := range d.gzBank {
		if p := gz.Power(xi); p > bestPow {
			bestPow, bestIdx = p, i
		}
	}

	// Raw bin power divided by window energy grows with the window length
	// (about N/3 for a pure tone), so it is rescaled by the value a pure tone
	// would give before being compared with the threshold.
	ratio := bestPow / (hannPureToneGain(d.winN)*total + 1e-12)
	if bestIdx < 0 || ratio < d.ratioThresh {
		d.logger.Debug("Ratio below threshold",
			"time", now.Format(time.RFC3339),
			"ratio", fmt.Sprintf("%.3f", ratio),
			"threshold", d.ratioThresh)
		d.missWindow(hopMs, now)
		return "", 0, 0, 0, 0, time.Time{}
	}
	freq := d.freqs[bestIdx]

	switch {
	case d.waitingB:
		if !d.trackToneB(freq, now, hopMs, hopDur) {
			break
		}
		aDurMs := float64(d.aEnd.Sub(d.aStart).Milliseconds())
		bDurMs := float64(d.bEnd.Sub(d.bStart).Milliseconds())
		d.logger.Info("Two-tone detected",
			"time", now.Format(time.RFC3339),
			"tone_a_freq", fmt.Sprintf("%.1f", d.aFreq),
			"tone_a_duration_ms", fmt.Sprintf("%.0f", aDurMs),
			"tone_b_freq", fmt.Sprintf("%.1f", d.bFreq),
			"tone_b_duration_ms", fmt.Sprintf("%.0f", bDurMs))
		return "TWO_TONE_DETECTED", d.aFreq, aDurMs, d.bFreq, bDurMs, now
	case d.inA && math.Abs(freq-d.aFreq) <= toneMatchHz:
		// Confirming Tone A
		d.extendToneA(now, hopMs, hopDur)
	default:
		if d.inA {
			d.logger.Debug("Frequency differs from Tone A, resetting",
				"time", now.Format(time.RFC3339),
				"freq", fmt.Sprintf("%.1f", freq),
				"tone_a_freq", fmt.Sprintf("%.1f", d.aFreq))
		}
		// This window is itself a valid tone, so it seeds the next Tone A
		// candidate instead of being thrown away.
		d.reset()
		d.inA = true
		d.aFreq = freq
		d.aStart = now
		d.extendToneA(now, hopMs, hopDur)
	}
	return "", 0, 0, 0, 0, time.Time{}
}

// hannPureToneGain returns (sum w)^2 / (2 * sum w^2) for the n-point window
// applied by windowHann: the ratio of peak-bin power to windowed signal energy
// that a pure, bin-centred tone produces. It is recomputed per call, which
// costs one extra pass over n samples next to the filter bank's hundreds.
func hannPureToneGain(n int) float64 {
	if n < 3 {
		// Degenerate windows carry no usable weighting.
		return 1
	}
	w := make([]float64, n)
	for i := range w {
		w[i] = 1
	}
	windowHann(w)

	var sum, sumSq float64
	for _, v := range w {
		sum += v
		sumSq += v * v
	}
	if sumSq <= 0 {
		return 1
	}
	return sum * sum / (2 * sumSq)
}

// extendToneA credits the current window to Tone A and switches to waiting
// for Tone B once the minimum Tone A duration has been reached.
func (d *twoToneDetector) extendToneA(now time.Time, hopMs int, hopDur time.Duration) {
	d.aAccumMs += hopMs
	d.aEnd = now.Add(hopDur)
	if d.aAccumMs >= d.minAms {
		d.inA = false
		d.waitingB = true
		d.gapRemainMs = d.gapMaxMs
	}
}

// trackToneB handles a tone window at freq while waiting for Tone B and
// reports whether Tone B has now reached its minimum duration.
func (d *twoToneDetector) trackToneB(freq float64, now time.Time, hopMs int, hopDur time.Duration) bool {
	if math.Abs(freq-d.aFreq) <= toneMatchHz {
		// Tone A is still (or again) sounding: the gap has not begun yet.
		d.dropToneB()
		d.aEnd = now.Add(hopDur)
		d.gapRemainMs = d.gapMaxMs
		return false
	}

	if d.bAccumMs > 0 && math.Abs(freq-d.bFreq) > toneMatchHz {
		d.logger.Debug("Frequency differs from Tone B, resetting B",
			"time", now.Format(time.RFC3339),
			"freq", fmt.Sprintf("%.1f", freq),
			"tone_b_freq", fmt.Sprintf("%.1f", d.bFreq))
		// The abandoned candidate turned out not to be Tone B, so the time
		// it occupied was part of the gap.
		lostMs := d.bAccumMs
		d.dropToneB()
		if !d.spendGap(lostMs, now) {
			return false
		}
	}

	if d.bAccumMs == 0 {
		d.bFreq = freq
		d.bStart = now
	}
	d.bAccumMs += hopMs
	d.bEnd = now.Add(hopDur)
	return d.bAccumMs >= d.minBms
}

// missWindow handles a window that contains no usable tone. Outside the
// waiting-for-B state it resets the detector; while waiting for Tone B it
// abandons any partial B candidate and charges the lost time to the gap.
func (d *twoToneDetector) missWindow(hopMs int, now time.Time) {
	if !d.waitingB {
		d.reset()
		return
	}
	lostMs := d.bAccumMs + hopMs
	d.dropToneB()
	d.spendGap(lostMs, now)
}

// spendGap deducts ms from the remaining gap budget. It reports true while
// the budget is not overdrawn; otherwise it resets the detector and reports
// false.
func (d *twoToneDetector) spendGap(ms int, now time.Time) bool {
	d.gapRemainMs -= ms
	if d.gapRemainMs >= 0 {
		return true
	}
	d.logger.Debug("Gap exceeded max duration, resetting",
		"time", now.Format(time.RFC3339),
		"gap_max_ms", d.gapMaxMs)
	d.reset()
	return false
}

// dropToneB discards the current Tone B candidate without touching Tone A.
func (d *twoToneDetector) dropToneB() {
	d.bFreq = 0
	d.bAccumMs = 0
	d.bStart = time.Time{}
	d.bEnd = time.Time{}
}

// reset returns the detector to its idle state so that the next tone window
// starts a fresh Tone A candidate. It clears the phase flags, both tone
// frequencies, both accumulated durations, all four start/end timestamps and
// the remaining gap budget. Configuration, the filter bank and the logger are
// left untouched.
func (d *twoToneDetector) reset() {
	var unset time.Time

	// Phase flags.
	d.inA, d.waitingB = false, false

	// Tone A bookkeeping.
	d.aFreq, d.aAccumMs = 0, 0
	d.aStart, d.aEnd = unset, unset

	// Tone B bookkeeping.
	d.bFreq, d.bAccumMs = 0, 0
	d.bStart, d.bEnd = unset, unset

	d.gapRemainMs = 0
}

// main parses the flags, validates the input file (mono, 8 kHz, 16-bit WAV)
// and the window/hop sizes, then streams the audio through a twoToneDetector
// and prints every detected two-tone sequence to standard output. Diagnostics
// go to standard error as JSON; -verbose enables debug-level records.
//
// Window timestamps are the wall-clock time at which processing started plus
// the window's offset into the file.
func main() {
	flag.Parse()

	// Initialize slog logger
	logLevel := &slog.LevelVar{}
	logLevel.Set(slog.LevelInfo)
	if *verbose {
		logLevel.Set(slog.LevelDebug)
	}
	logger := slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{
		Level: logLevel,
	}))

	if *wavFile == "" {
		logger.Error("WAV file path is required", "flag", "-wav")
		os.Exit(1)
	}

	file, err := os.Open(*wavFile)
	if err != nil {
		logger.Error("Failed to open WAV file", "error", err)
		os.Exit(1)
	}
	defer file.Close()

	decoder := wav.NewDecoder(file)
	if !decoder.IsValidFile() {
		logger.Error("Invalid WAV file")
		os.Exit(1)
	}
	if decoder.Format().SampleRate != 8000 || decoder.Format().NumChannels != 1 {
		logger.Error("WAV file must be mono 8kHz",
			"sample_rate", decoder.Format().SampleRate,
			"channels", decoder.Format().NumChannels)
		os.Exit(1)
	}
	// Samples are narrowed to int16 below, which is only lossless for 16-bit
	// audio; other depths would be truncated into garbage.
	if decoder.BitDepth != 16 {
		logger.Error("WAV file must be 16-bit PCM", "bit_depth", decoder.BitDepth)
		os.Exit(1)
	}

	const fs = 8000
	winN := int(float64(fs) * float64(*winMs) / 1000.0)
	hopN := int(float64(fs) * float64(*hopMs) / 1000.0)
	if winN <= 0 || hopN <= 0 || hopN > winN {
		logger.Error("Invalid window/hop parameters",
			"winN", winN,
			"hopN", hopN)
		os.Exit(1)
	}

	det := newTwoToneDetector(fs, winN, hopN, *ratioThresh, *rmsThresh, *minAms, *minBms, *gapMaxMs, logger)

	buf := &audio.IntBuffer{
		Format:         &audio.Format{SampleRate: fs, NumChannels: 1},
		Data:           make([]int, 8192),
		SourceBitDepth: 16,
	}
	sampleCount := 0
	startTime := time.Now()

	// pending holds decoded samples that have not yet been stepped past, so
	// windows run on one continuous hop grid across decoder reads instead of
	// restarting (and dropping the tail) at every 8192-sample chunk.
	// pendingStart is the file-wide sample index of pending[0].
	pending := make([]int16, 0, winN+len(buf.Data))
	pendingStart := 0

	logger.Info("Processing WAV file")
	for {
		n, err := decoder.PCMBuffer(buf)
		if err != nil {
			// Report the failure rather than passing it off as end of file;
			// whatever was decoded so far has already been analysed.
			logger.Error("Failed to decode WAV data", "error", err)
			break
		}
		if n == 0 || len(buf.Data) == 0 {
			break
		}

		for _, v := range buf.Data[:n] {
			pending = append(pending, int16(v))
		}
		sampleCount += n

		offset := 0
		for ; offset+winN <= len(pending); offset += hopN {
			t := startTime.Add(time.Duration(pendingStart+offset) * time.Second / time.Duration(fs))
			event, aFreq, aDur, bFreq, bDur, timestamp := det.stepWindow(pending[offset:offset+winN], t)
			if event != "" {
				fmt.Printf("Detected two-tone sequence at %s:\n", timestamp.Format(time.RFC3339))
				fmt.Printf("  Tone A: %.1f Hz, duration %.0f ms\n", aFreq, aDur)
				fmt.Printf("  Tone B: %.1f Hz, duration %.0f ms\n", bFreq, bDur)
				det.reset()
			}
		}

		// Keep only the samples the next window still needs. Because
		// hopN <= winN, offset never exceeds len(pending).
		pendingStart += offset
		pending = pending[:copy(pending, pending[offset:])]
	}

	logger.Info("Finished processing",
		"samples", sampleCount,
		"duration_sec", fmt.Sprintf("%.2f", float64(sampleCount)/float64(fs)))
}

// min reports the lesser of the two integers a and b; when they are equal
// that common value is returned.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}