Files
bamort/backend/importer/detector.go
T
2026-02-27 12:00:40 +01:00

204 lines
5.1 KiB
Go

package importer
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"strings"
"sync"
"time"
)
// DetectionCache is a single cached format-detection result. Detector stores
// one entry per input signature and treats it as valid until its TTL elapses.
type DetectionCache struct {
// signature is the hex-encoded SHA-256 produced by Detector.computeSignature
// for the input; the same string is used as the entry's key in the cache map.
signature string
// adapterID is the ID of the adapter that detection selected for the input.
adapterID string
// confidence is the confidence value that was returned together with adapterID.
confidence float64
// cachedAt is the time the entry was stored; expiry is measured from it.
cachedAt time.Time
}
// Detector chooses an import adapter for an input. It tries cheap checks
// (explicit adapter, file extension, signature cache) before delegating to the
// registry's full detection, and runs a background goroutine that evicts
// expired cache entries.
type Detector struct {
// registry provides adapter lookup, the healthy-adapter list and full detection.
registry *AdapterRegistry
// cache maps an input signature to the most recent detection result for it.
cache map[string]*DetectionCache
// cacheMu guards cache.
cacheMu sync.RWMutex
// cacheTTL is how long a cache entry remains valid after it is stored.
cacheTTL time.Duration
// stopCacheCleanup is closed to make the background cleanup goroutine exit.
stopCacheCleanup chan struct{}
}
// NewDetector builds a Detector backed by registry. The detector starts with
// an empty signature cache and a five-minute entry TTL. Its background
// cache-cleanup goroutine is already running when NewDetector returns; call
// StopCacheCleanup to end it.
func NewDetector(registry *AdapterRegistry) *Detector {
	const defaultCacheTTL = 5 * time.Minute

	detector := new(Detector)
	detector.registry = registry
	detector.cache = map[string]*DetectionCache{}
	detector.cacheTTL = defaultCacheTTL
	detector.stopCacheCleanup = make(chan struct{})

	detector.startCacheCleanup()
	return detector
}
// DetectFormat selects the adapter that should handle data.
//
// Resolution order (the first step that produces a result wins):
//  1. specifiedAdapterID, when non-empty: the adapter must be registered and
//     healthy, otherwise an error is returned. Confidence is 1.0.
//  2. File extension: when exactly one healthy, detect-capable adapter lists
//     the extension of filename, it is returned with confidence 1.0.
//  3. Signature cache: a non-expired entry keyed by the SHA-256 of the first
//     1KB of data, provided the cached adapter is still registered and
//     healthy.
//  4. Full detection through the registry; a successful result is cached.
//
// It returns the adapter ID and confidence, or a non-nil error when the
// specified adapter is unusable or the registry's detection fails.
func (d *Detector) DetectFormat(data []byte, filename string, specifiedAdapterID string) (string, float64, error) {
	// Step 1: User-specified adapter (highest priority).
	if specifiedAdapterID != "" {
		adapter := d.registry.Get(specifiedAdapterID)
		if adapter == nil {
			return "", 0, fmt.Errorf("specified adapter not found: %s", specifiedAdapterID)
		}
		if !adapter.Healthy {
			return "", 0, fmt.Errorf("specified adapter is unhealthy: %s", specifiedAdapterID)
		}
		return specifiedAdapterID, 1.0, nil
	}

	// Step 2: Extension match. Only an unambiguous (single) match
	// short-circuits; zero or several matches fall through to the steps below.
	if ext := getFileExtension(filename); ext != "" {
		if matches := d.getAdaptersByExtension(ext); len(matches) == 1 {
			return matches[0].ID, 1.0, nil
		}
	}

	// Step 3: Signature cache. An entry can outlive the adapter it points to
	// (the adapter may have been removed or turned unhealthy since it was
	// cached), so re-validate it the same way step 1 validates an explicit
	// choice. A stale hit falls through to full detection, whose result then
	// overwrites the entry.
	if cachedAdapterID, cachedConfidence := d.getCachedDetection(data); cachedAdapterID != "" {
		if cached := d.registry.Get(cachedAdapterID); cached != nil && cached.Healthy {
			return cachedAdapterID, cachedConfidence, nil
		}
	}

	// Step 4: Full detection via the registry.
	adapterID, confidence, err := d.registry.Detect(data, filename)
	if err != nil {
		return "", 0, err
	}

	// Remember the successful detection for subsequent identical inputs.
	d.cacheDetection(data, adapterID, confidence)
	return adapterID, confidence, nil
}
// getAdaptersByExtension collects every healthy adapter that has the "detect"
// capability and lists ext among its supported extensions. The comparison is
// done on lower-cased strings, and each adapter appears at most once. The
// returned slice is never nil.
func (d *Detector) getAdaptersByExtension(ext string) []*AdapterMetadata {
	wanted := strings.ToLower(ext)
	found := []*AdapterMetadata{}

	for _, candidate := range d.registry.GetHealthy() {
		if candidate.SupportsCapability("detect") {
			for _, declared := range candidate.SupportedExtensions {
				if strings.ToLower(declared) != wanted {
					continue
				}
				found = append(found, candidate)
				break // one hit per adapter is enough
			}
		}
	}
	return found
}
// getCachedDetection looks up the cache entry for data's signature. It returns
// the cached adapter ID and confidence when an entry exists and is no older
// than cacheTTL, and ("", 0) otherwise. Expired entries are left in place;
// eviction is handled by the cleanup goroutine.
func (d *Detector) getCachedDetection(data []byte) (string, float64) {
	key := d.computeSignature(data)

	d.cacheMu.RLock()
	defer d.cacheMu.RUnlock()

	if entry, found := d.cache[key]; found && time.Since(entry.cachedAt) <= d.cacheTTL {
		return entry.adapterID, entry.confidence
	}
	return "", 0
}
// cacheDetection records adapterID and confidence as the detection result for
// data's signature, replacing any previous entry and stamping it with the
// current time.
func (d *Detector) cacheDetection(data []byte, adapterID string, confidence float64) {
	key := d.computeSignature(data)

	d.cacheMu.Lock()
	defer d.cacheMu.Unlock()

	entry := new(DetectionCache)
	entry.signature = key
	entry.adapterID = adapterID
	entry.confidence = confidence
	entry.cachedAt = time.Now()
	d.cache[key] = entry
}
// computeSignature returns the hex-encoded SHA-256 digest of the first 1024
// bytes of data, or of all of data when it is shorter than that. Inputs that
// share their first 1024 bytes therefore share a signature.
func (d *Detector) computeSignature(data []byte) string {
	const prefixLen = 1024

	head := data
	if len(head) > prefixLen {
		head = head[:prefixLen]
	}
	digest := sha256.Sum256(head)
	return hex.EncodeToString(digest[:])
}
// getFileExtension returns the lower-cased extension of filename, including
// the leading dot (e.g. "Hero.JSON" -> ".json").
//
// It returns "" when filename is empty, contains no dot, ends with a dot, or
// when the last dot belongs to a directory component rather than to the final
// path element (e.g. "my.dir/file"). Both '/' and '\' are treated as path
// separators so names produced on any platform are handled the same way.
func getFileExtension(filename string) string {
	lastDot := strings.LastIndex(filename, ".")
	if lastDot == -1 || lastDot == len(filename)-1 {
		return ""
	}

	// A separator after the last dot means the dot is part of a directory
	// name, so the final path element has no extension.
	if lastSep := strings.LastIndexAny(filename, `/\`); lastSep > lastDot {
		return ""
	}

	return strings.ToLower(filename[lastDot:])
}
// startCacheCleanup launches the background goroutine that evicts expired
// cache entries every ten minutes. The goroutine runs until stopCacheCleanup
// is closed, at which point it stops its ticker and exits.
func (d *Detector) startCacheCleanup() {
	const cleanupInterval = 10 * time.Minute

	ticker := time.NewTicker(cleanupInterval)
	go func() {
		defer ticker.Stop()
		for {
			select {
			case <-d.stopCacheCleanup:
				return
			case <-ticker.C:
				d.cleanupExpiredEntries()
			}
		}
	}()
}
// StopCacheCleanup signals the background cache-cleanup goroutine to exit.
//
// It is idempotent and safe to call from several goroutines: only the first
// call closes the stop channel, and later calls are no-ops. Closing an
// already-closed channel panics, so the channel is probed before it is
// closed. Calling it on a Detector whose stop channel was never created is
// also a no-op.
func (d *Detector) StopCacheCleanup() {
	// cacheMu serialises concurrent callers so that the "already closed?"
	// probe and the close cannot interleave.
	d.cacheMu.Lock()
	defer d.cacheMu.Unlock()

	if d.stopCacheCleanup == nil {
		return
	}

	select {
	case <-d.stopCacheCleanup:
		// Nothing is ever sent on this channel, so a successful receive
		// means an earlier call already closed it.
	default:
		close(d.stopCacheCleanup)
	}
}
// cleanupExpiredEntries evicts every cache entry whose age exceeds cacheTTL.
// Ages are measured against a single timestamp taken once the write lock is
// held.
func (d *Detector) cleanupExpiredEntries() {
	d.cacheMu.Lock()
	defer d.cacheMu.Unlock()

	sweepTime := time.Now()
	for key, cached := range d.cache {
		age := sweepTime.Sub(cached.cachedAt)
		if age <= d.cacheTTL {
			continue
		}
		delete(d.cache, key)
	}
}