docsys/main.go

668 lines
18 KiB
Go

package main
import (
"encoding/base64"
"encoding/csv"
"encoding/json"
"fmt"
"html/template"
"io"
"log"
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
)
// Package-level state shared by the handlers in this file.
var (
// tmplFuncs is the helper map attached to every parsed template; main assigns it.
tmplFuncs template.FuncMap
// documentsDir is the data root; initPaths reads it from DOCSYS_DATA_DIR
// and falls back to "/srv/docsys".
documentsDir string
// recordsDir, storeDir, indexDir and inboxDir are the "records", "store",
// "index" and "inbox" subdirectories of documentsDir, set by initPaths.
recordsDir string
storeDir string
indexDir string
inboxDir string
)
// initPaths resolves the data root from the DOCSYS_DATA_DIR environment
// variable (using /srv/docsys when it is unset or empty) and derives the
// records, store, index and inbox directories underneath it.
func initPaths() {
	root := os.Getenv("DOCSYS_DATA_DIR")
	if len(root) == 0 {
		root = "/srv/docsys"
	}

	documentsDir = root
	recordsDir, storeDir = filepath.Join(root, "records"), filepath.Join(root, "store")
	indexDir, inboxDir = filepath.Join(root, "index"), filepath.Join(root, "inbox")
}
// categories is the fixed list of category names handed to the dashboard,
// browse and document templates and to GetCategoryStats.
var categories = []string{
"taxes", "bills", "medical", "insurance", "legal",
"financial", "expenses", "vehicles", "home", "personal", "contacts",
"inou", "sophia", "uncategorized",
}
// main wires the application together: it resolves data directories, opens
// the database, registers template helpers and HTTP routes, starts the inbox
// watcher and then serves on port 9201 until the listener fails.
func main() {
	initPaths()

	// Initialize database
	dbPath := filepath.Join(indexDir, "docsys.db")
	if err := InitDB(dbPath); err != nil {
		log.Fatalf("Failed to initialize database: %v", err)
	}
	// NOTE: log.Fatal below exits via os.Exit, so this deferred call only runs
	// if main returns some other way.
	defer CloseDB()

	// Note: Markdown record indexing disabled - we now store directly in DB
	// if err := IndexDocumentsFromDirectory(recordsDir, storeDir, categories); err != nil {
	// 	log.Printf("Warning: Failed to index documents: %v", err)
	// }

	// Ensure inbox directory exists. A failure is reported rather than
	// silently dropped, because uploads and the watcher both depend on it.
	if err := os.MkdirAll(inboxDir, 0755); err != nil {
		log.Printf("Warning: could not create inbox directory %s: %v", inboxDir, err)
	}

	// Template functions
	tmplFuncs = template.FuncMap{
		"truncate":       truncateText,
		"categoryIcon":   categoryIcon,
		"formatDate":     formatDate,
		"formatDateTime": formatDateTime,
		"lower":          strings.ToLower,
		"title":          strings.Title,
		// "safe" bypasses html/template escaping; only use it on trusted markup.
		"safe":     func(s string) template.HTML { return template.HTML(s) },
		"multiply": func(a float64, b float64) float64 { return a * b },
	}

	r := chi.NewRouter()
	r.Use(middleware.Logger)
	r.Use(middleware.Recoverer)
	r.Use(middleware.Compress(5))

	// Static files
	r.Handle("/static/*", http.StripPrefix("/static/", http.FileServer(http.Dir("static"))))

	// PDF serving
	r.Get("/pdf/{hash}", servePDF)

	// Pages
	r.Get("/", dashboardHandler)
	r.Get("/browse", browseHandler)
	r.Get("/browse/{category}", browseCategoryHandler)
	r.Get("/document/{id}", documentHandler)
	r.Get("/search", searchHandler)

	// API endpoints
	r.Post("/api/search", apiSearchHandler)
	r.Get("/api/documents", apiDocumentsHandler)
	r.Get("/api/processing", apiProcessingHandler)
	r.Post("/api/upload", uploadHandler)
	r.Post("/api/ingest", ingestHandler)
	r.Put("/api/document/{id}", updateDocumentHandler)
	r.Delete("/api/document/{id}", deleteDocumentHandler)
	r.Get("/api/export", exportCSVHandler)
	r.Post("/api/share/{id}", createShareHandler)
	r.Delete("/api/share/{token}", deleteShareHandler)
	r.Get("/api/shares/{id}", listSharesHandler)
	r.Get("/s/{token}", publicShareHandler)
	r.Post("/api/reindex", reindexHandler)
	r.Get("/api/debug/stats", debugStatsHandler)

	// Watch inbox directory for new files (scanner via SFTP, web upload, etc.)
	StartInboxWatcher()

	port := ":9201"
	log.Printf("🗂️ DocSys starting on http://localhost%s", port)
	log.Printf("📁 Documents: %s", documentsDir)
	log.Fatal(http.ListenAndServe(port, r))
}
// Template helpers
// truncateText shortens s to at most n characters and appends "..." when
// anything was cut off. Strings that already fit are returned unchanged.
//
// The limit is counted in runes so a multi-byte UTF-8 character is never
// split in half; for ASCII input this is identical to a byte limit. A
// negative n is treated as 0 instead of panicking.
func truncateText(s string, n int) string {
	if n < 0 {
		n = 0
	}
	// Byte length is an upper bound on rune count, so this is a cheap exit.
	if len(s) <= n {
		return s
	}
	seen := 0
	for i := range s {
		if seen == n {
			return s[:i] + "..."
		}
		seen++
	}
	// Fewer than n+1 runes: nothing to cut.
	return s
}
// categoryIcon maps a category name to the emoji shown next to it.
// Names without a dedicated icon get the generic document emoji.
func categoryIcon(cat string) string {
	switch cat {
	case "taxes":
		return "📋"
	case "bills":
		return "💰"
	case "medical":
		return "🏥"
	case "insurance":
		return "🛡️"
	case "legal":
		return "⚖️"
	case "financial":
		return "🏦"
	case "expenses":
		return "💳"
	case "vehicles":
		return "🚗"
	case "home":
		return "🏠"
	case "personal":
		return "👤"
	case "contacts":
		return "📇"
	case "inou":
		return "🏢"
	case "sophia":
		return "👧"
	case "uncategorized":
		return "📁"
	default:
		return "📄"
	}
}
// formatDate renders a date string as "Jan 2, 2006". It tries a list of
// known layouts in order and returns s unchanged when none of them match.
//
// Accepted inputs are ISO timestamps with or without a zone (fractional
// seconds allowed), "YYYY-MM-DD HH:MM:SS", plain "YYYY-MM-DD", and long-form
// dates such as "March 5, 2024". time.Parse matches month names
// case-insensitively, so lowercase month names need no separate layout.
func formatDate(s string) string {
	layouts := []string{
		"2006-01-02T15:04:05.999999",
		"2006-01-02T15:04:05",
		time.RFC3339, // zone-qualified timestamps ("Z" or "+hh:mm")
		"2006-01-02 15:04:05",
		"2006-01-02",
		"January 2, 2006",
	}
	for _, layout := range layouts {
		if t, err := time.Parse(layout, s); err == nil {
			return t.Format("Jan 2, 2006")
		}
	}
	return s
}
// displayLocation is the zone formatDateTime renders timestamps in. It is
// resolved once at startup instead of on every call.
var displayLocation = loadDisplayLocation()

// loadDisplayLocation returns America/New_York, or UTC when that zone cannot
// be loaded (for example when no tz database is available).
func loadDisplayLocation() *time.Location {
	loc, err := time.LoadLocation("America/New_York")
	if err != nil || loc == nil {
		return time.UTC
	}
	return loc
}

// formatDateTime renders a timestamp string as "Jan 02, 2006 3:04 PM MST" in
// the display zone. It tries a list of known layouts in order and returns s
// unchanged when none of them match.
//
// Zone-qualified input may use either a numeric offset or the "Z" suffix.
// Input without a zone is parsed as UTC before conversion. Fractional seconds
// are accepted on every timestamp layout.
func formatDateTime(s string) string {
	layouts := []string{
		time.RFC3339, // "2006-01-02T15:04:05Z07:00": offset or "Z"
		"2006-01-02T15:04:05",
		"2006-01-02 15:04:05",
		"2006-01-02",
	}
	for _, layout := range layouts {
		if t, err := time.Parse(layout, s); err == nil {
			return t.In(displayLocation).Format("Jan 02, 2006 3:04 PM MST")
		}
	}
	return s
}
// Template rendering
// renderTemplate parses templates/base.html together with
// templates/<name>.html and writes the "base" template, executed with data,
// to w.
//
// Parse and execution failures are logged and answered with a 500 instead of
// panicking. The page is rendered into a buffer first so that a failure
// half-way through never leaves a partially written page followed by an
// error message.
func renderTemplate(w http.ResponseWriter, name string, data interface{}) {
	tmpl, err := template.New("").Funcs(tmplFuncs).ParseFiles(
		"templates/base.html",
		"templates/"+name+".html",
	)
	if err != nil {
		log.Printf("Template error: %v", err)
		http.Error(w, "Template error", http.StatusInternalServerError)
		return
	}

	var page strings.Builder
	if err := tmpl.ExecuteTemplate(&page, "base", data); err != nil {
		log.Printf("Template error: %v", err)
		http.Error(w, "Template error", http.StatusInternalServerError)
		return
	}

	if _, err := io.WriteString(w, page.String()); err != nil {
		// Typically the client went away; nothing more can be sent.
		log.Printf("Template write error: %v", err)
	}
}
// renderPartial parses templates/partials/<name>.html and writes the template
// named "partials/<name>.html", executed with data, to w.
//
// NOTE(review): ParseFiles registers a file under its base name, so the
// executed name presumably comes from a {{define}} inside the partial file —
// confirm against the template sources.
//
// Parse and execution failures are logged and answered with a 500 instead of
// panicking. Output is buffered so a failed render never emits a partial
// fragment.
func renderPartial(w http.ResponseWriter, name string, data interface{}) {
	tmpl, err := template.New("").Funcs(tmplFuncs).ParseFiles(
		"templates/partials/" + name + ".html",
	)
	if err != nil {
		log.Printf("Template error: %v", err)
		http.Error(w, "Template error", http.StatusInternalServerError)
		return
	}

	var fragment strings.Builder
	if err := tmpl.ExecuteTemplate(&fragment, "partials/"+name+".html", data); err != nil {
		log.Printf("Template error: %v", err)
		http.Error(w, "Template error", http.StatusInternalServerError)
		return
	}

	if _, err := io.WriteString(w, fragment.String()); err != nil {
		// Typically the client went away; nothing more can be sent.
		log.Printf("Template write error: %v", err)
	}
}
// Page handlers
// dashboardHandler renders the dashboard page with the current statistics
// and the category list.
func dashboardHandler(w http.ResponseWriter, r *http.Request) {
	stats, err := GetStats()
	if err != nil {
		// Best effort: the page is still rendered with whatever GetStats
		// returned, but the failure is no longer invisible.
		log.Printf("Dashboard stats error: %v", err)
	}
	renderTemplate(w, "dashboard", map[string]interface{}{
		"Title":      "Dashboard",
		"Stats":      stats,
		"Categories": categories,
	})
}
func browseHandler(w http.ResponseWriter, r *http.Request) {
renderTemplate(w, "browse", map[string]interface{}{
"Title": "Browse Documents",
"Categories": categories,
"CatStats": GetCategoryStats(categories),
})
}
// browseCategoryHandler renders the document list for the category named in
// the URL.
func browseCategoryHandler(w http.ResponseWriter, r *http.Request) {
	category := chi.URLParam(r, "category")
	docs, err := GetDocumentsByCategory(category)
	if err != nil {
		// Best effort: render the page with whatever was returned, but
		// record why the list may be empty.
		log.Printf("Category lookup error for %q: %v", category, err)
	}
	renderTemplate(w, "category", map[string]interface{}{
		"Title":     strings.Title(category),
		"Category":  category,
		"Documents": docs,
	})
}
// documentHandler renders the detail page for the document whose ID is in
// the URL, or answers 404 when it cannot be loaded.
func documentHandler(w http.ResponseWriter, r *http.Request) {
	id := chi.URLParam(r, "id")
	doc, err := GetDocument(id)
	// Other callers in this file treat a nil document without an error as
	// "not found", so guard against it here too rather than dereferencing nil.
	if err != nil || doc == nil {
		http.Error(w, "Document not found", http.StatusNotFound)
		return
	}
	renderTemplate(w, "document", map[string]interface{}{
		"Title":      doc.Title,
		"Document":   doc,
		"Categories": categories,
	})
}
// searchHandler renders the search page for the "q" query parameter.
//
// A non-empty query is first run through the keyword search (up to 50 hits).
// When that yields nothing, the query is embedded and a semantic search (up
// to 10 hits) is tried instead. Failures at any stage are logged and the page
// is rendered with whatever results are available.
func searchHandler(w http.ResponseWriter, r *http.Request) {
	query := r.URL.Query().Get("q")
	var docs []Document
	if query != "" {
		var err error
		// Try FTS first
		docs, err = SearchDocuments(query, 50)
		if err != nil {
			log.Printf("Search error for %q: %v", query, err)
		}
		// If no keyword results, try semantic search
		if len(docs) == 0 {
			emb, embErr := GenerateEmbedding(query)
			if embErr != nil {
				log.Printf("Embedding error for %q: %v", query, embErr)
			} else {
				docs, err = SemanticSearch(emb, 10)
				if err != nil {
					log.Printf("Semantic search error for %q: %v", query, err)
				}
			}
		}
	}
	renderTemplate(w, "search", map[string]interface{}{
		"Title":     "Search",
		"Query":     query,
		"Documents": docs,
	})
}
// servePDF serves the stored file for the hash in the URL. It looks for
// <hash>.pdf, then <hash>.txt, then a file named exactly <hash> inside
// storeDir, and answers 404 when none exists.
//
// The hash comes straight from the request, so values that are not a plain
// file name ("", ".", "..", or anything containing a path separator) are
// rejected before they reach the filesystem. Directories are never served.
func servePDF(w http.ResponseWriter, r *http.Request) {
	hash := chi.URLParam(r, "hash")
	if hash == "" || hash == "." || hash == ".." || strings.ContainsAny(hash, `/\`) {
		http.Error(w, "File not found", http.StatusNotFound)
		return
	}

	// Try PDF first, then TXT, then the bare name. The bare name gets no
	// explicit Content-Type.
	candidates := []struct {
		ext         string
		contentType string
	}{
		{".pdf", "application/pdf"},
		{".txt", "text/plain"},
		{"", ""},
	}
	for _, c := range candidates {
		path := filepath.Join(storeDir, hash+c.ext)
		info, err := os.Stat(path)
		if err != nil || info.IsDir() {
			continue
		}
		if c.contentType != "" {
			w.Header().Set("Content-Type", c.contentType)
		}
		http.ServeFile(w, r, path)
		return
	}
	http.Error(w, "File not found", http.StatusNotFound)
}
// API handlers
// apiSearchHandler answers a search request with the rendered
// "document-list" partial. An empty "q" form value produces an empty body.
// When the primary search reports an error, the fallback search is used.
func apiSearchHandler(w http.ResponseWriter, r *http.Request) {
	const maxResults = 50

	term := r.FormValue("q")
	if len(term) == 0 {
		w.Write([]byte{})
		return
	}

	results, searchErr := SearchDocuments(term, maxResults)
	if searchErr != nil {
		// Fallback to simple search
		results, _ = SearchDocumentsFallback(term, maxResults)
	}
	renderPartial(w, "document-list", results)
}
// apiProcessingHandler returns the currently active jobs as JSON.
func apiProcessingHandler(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(GetActiveJobs()); err != nil {
		log.Printf("Processing status encode error: %v", err)
	}
}
// apiDocumentsHandler returns documents as a JSON array: those in the
// "category" query parameter when it is set, otherwise all documents.
//
// Lookup failures are logged. The response is always an array ("[]" when
// there is nothing to return), matching how listSharesHandler avoids
// emitting JSON null.
func apiDocumentsHandler(w http.ResponseWriter, r *http.Request) {
	category := r.URL.Query().Get("category")
	var (
		docs []Document
		err  error
	)
	if category != "" {
		docs, err = GetDocumentsByCategory(category)
	} else {
		docs, err = GetAllDocuments()
	}
	if err != nil {
		log.Printf("Document listing error: %v", err)
	}
	if docs == nil {
		// A nil slice would encode as null.
		docs = []Document{}
	}
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(docs); err != nil {
		log.Printf("Document listing encode error: %v", err)
	}
}
// uploadHandler accepts a multipart upload in the "file" field, stores it in
// the inbox under a timestamp-prefixed name, and starts processing it in the
// background.
//
// If a document with the same content hash already exists and is not still
// processing, the inbox copy is removed and the existing document is
// reported with status "duplicate". Otherwise a pending document is inserted
// right away and the response carries status "success" and the hash as id.
//
// Every step of saving and hashing is checked. On failure the partial inbox
// file is removed and a 500 is returned, so an incomplete file is never
// processed.
func uploadHandler(w http.ResponseWriter, r *http.Request) {
	// 32 MiB is the amount of the form kept in memory; larger parts are
	// spilled to temporary files. It is not an upload size limit.
	if err := r.ParseMultipartForm(32 << 20); err != nil {
		http.Error(w, "Failed to read file", http.StatusBadRequest)
		return
	}
	file, header, err := r.FormFile("file")
	if err != nil {
		http.Error(w, "Failed to read file", http.StatusBadRequest)
		return
	}
	defer file.Close()

	// Save to inbox. Only the final path element of the client-supplied
	// name is used, so it cannot point outside the inbox.
	filename := fmt.Sprintf("%d_%s", time.Now().Unix(), filepath.Base(header.Filename))
	destPath := filepath.Join(inboxDir, filename)
	dest, err := os.Create(destPath)
	if err != nil {
		http.Error(w, "Failed to save file", http.StatusInternalServerError)
		return
	}
	_, copyErr := io.Copy(dest, file)
	// Close before hashing/processing so the file is complete on disk and
	// no handle is held while other code moves or reads it.
	closeErr := dest.Close()
	if copyErr != nil || closeErr != nil {
		log.Printf("Upload save error for %s: copy=%v close=%v", filename, copyErr, closeErr)
		os.Remove(destPath)
		http.Error(w, "Failed to save file", http.StatusInternalServerError)
		return
	}

	// Check for duplicate before processing
	hash, err := FileHash(destPath)
	if err != nil {
		log.Printf("Upload hash error for %s: %v", filename, err)
		os.Remove(destPath)
		http.Error(w, "Failed to save file", http.StatusInternalServerError)
		return
	}
	// The lookup error is deliberately ignored: a new upload is expected
	// not to be found.
	existingDoc, _ := GetDocument(hash)
	if existingDoc != nil && existingDoc.Status != "processing" {
		// Document already exists — remove inbox file, return existing
		os.Remove(destPath)
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(map[string]interface{}{
			"status":   "duplicate",
			"filename": filename,
			"message":  "Document already exists in your library.",
			"document": map[string]string{
				"id":       existingDoc.ID,
				"title":    existingDoc.Title,
				"category": existingDoc.Category,
			},
		})
		return
	}

	// Create pending document immediately (shows in UI right away)
	InsertPendingDocument(hash, header.Filename)

	// Process document (async)
	go func() {
		doc, err := ProcessDocument(destPath)
		if err != nil {
			log.Printf("Process error for %s: %v", filename, err)
			UpdateDocumentStatus(hash, "error")
			return
		}
		log.Printf("Processed: %s → %s/%s", filename, doc.Category, doc.ID)
	}()

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"status":   "success",
		"filename": filename,
		"id":       hash,
		"message":  "Processing...",
	})
}
// ingestHandler stores a document delivered as JSON with base64 content and
// kicks off background processing.
//
//	POST /api/ingest
//	{
//	  "filename": "invoice.pdf",
//	  "content":  "<base64-encoded-data>",
//	  "source":   "email",               // optional metadata
//	  "subject":  "Your invoice",        // optional
//	  "from":     "billing@example.com"  // optional
//	}
//
// The decoded file is written to the inbox under a timestamp-prefixed name
// with path separators replaced by underscores. When any of the optional
// fields is present, all three are attached to the processed document as
// metadata.
func ingestHandler(w http.ResponseWriter, r *http.Request) {
	type ingestRequest struct {
		Filename string `json:"filename"`
		Content  string `json:"content"`
		Source   string `json:"source"`
		Subject  string `json:"subject"`
		From     string `json:"from"`
	}

	var payload ingestRequest
	if decodeErr := json.NewDecoder(r.Body).Decode(&payload); decodeErr != nil {
		http.Error(w, "Invalid JSON", http.StatusBadRequest)
		return
	}
	if !(payload.Filename != "" && payload.Content != "") {
		http.Error(w, "filename and content are required", http.StatusBadRequest)
		return
	}

	// Decode base64 content
	raw, decodeErr := base64.StdEncoding.DecodeString(payload.Content)
	if decodeErr != nil {
		http.Error(w, "Invalid base64 content", http.StatusBadRequest)
		return
	}

	// Sanitize filename: both slash styles become underscores.
	cleaned := strings.NewReplacer("/", "_", "\\", "_").Replace(payload.Filename)

	// Generate unique filename with timestamp
	stamp := time.Now().Unix()
	filename := fmt.Sprintf("%d_%s", stamp, cleaned)
	destPath := filepath.Join(inboxDir, filename)

	// Write file
	if writeErr := os.WriteFile(destPath, raw, 0644); writeErr != nil {
		http.Error(w, "Failed to write file", http.StatusInternalServerError)
		return
	}

	// Process immediately (async)
	go func() {
		doc, procErr := ProcessDocument(destPath)
		if procErr != nil {
			log.Printf("Process error for %s: %v", filename, procErr)
			return
		}
		// Store email metadata if provided
		noMetadata := payload.Source == "" && payload.Subject == "" && payload.From == ""
		if !noMetadata {
			doc.Metadata = map[string]string{
				"source":  payload.Source,
				"subject": payload.Subject,
				"from":    payload.From,
			}
			UpdateDocumentMetadata(doc.ID, doc.Metadata)
		}
		log.Printf("Processed: %s → %s/%s", filename, doc.Category, doc.ID)
	}()

	reply := map[string]string{
		"status":   "success",
		"filename": filename,
		"message":  "Document ingested. Processing started.",
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(reply)
}
// updateDocumentHandler applies a JSON {title, category, notes} update to the
// document whose ID is in the URL. When the category changes and the document
// has a record file, that file is moved into the new category's directory
// under recordsDir and the stored record path is updated.
//
// The category is used as a directory name, so values that could escape
// recordsDir (".", "..", or anything containing a path separator) are
// rejected with 400. A failed move is logged; the database update has already
// been applied at that point, so the request still reports success.
func updateDocumentHandler(w http.ResponseWriter, r *http.Request) {
	id := chi.URLParam(r, "id")
	var update struct {
		Title    string `json:"title"`
		Category string `json:"category"`
		Notes    string `json:"notes"`
	}
	if err := json.NewDecoder(r.Body).Decode(&update); err != nil {
		http.Error(w, "Invalid request", http.StatusBadRequest)
		return
	}
	if update.Category == "." || update.Category == ".." || strings.ContainsAny(update.Category, `/\`) {
		http.Error(w, "Invalid category", http.StatusBadRequest)
		return
	}

	// Get current document to check if category changed
	doc, err := GetDocument(id)
	if err != nil || doc == nil {
		http.Error(w, "Document not found", http.StatusNotFound)
		return
	}

	// Update in database
	if err := UpdateDocument(id, DocumentUpdate{
		Title:    update.Title,
		Category: update.Category,
		Notes:    update.Notes,
	}); err != nil {
		http.Error(w, "Failed to update", http.StatusInternalServerError)
		return
	}

	// Move file if category changed
	if doc.Category != update.Category && doc.RecordPath != "" {
		newDir := filepath.Join(recordsDir, update.Category)
		newPath := filepath.Join(newDir, filepath.Base(doc.RecordPath))
		if err := os.MkdirAll(newDir, 0755); err != nil {
			log.Printf("Update %s: cannot create %s: %v", id, newDir, err)
		} else if err := os.Rename(doc.RecordPath, newPath); err != nil {
			log.Printf("Update %s: cannot move record to %s: %v", id, newPath, err)
		} else {
			UpdateDocumentRecordPath(id, newPath)
		}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{"status": "success"})
}
// deleteDocumentHandler removes the document whose ID is in the URL: its
// database entry, its record file (when it has one) and any stored
// <id>.pdf, <id>.txt or <id> file in storeDir.
//
// File removal is best effort. Files that do not exist are expected and
// ignored; any other removal failure is logged so leftovers can be found.
func deleteDocumentHandler(w http.ResponseWriter, r *http.Request) {
	id := chi.URLParam(r, "id")
	doc, err := GetDocument(id)
	if err != nil || doc == nil {
		http.Error(w, "Document not found", http.StatusNotFound)
		return
	}

	// Delete from database (includes FTS and embeddings)
	DeleteDocument(id)

	// Collect the record file and the store files (PDF/TXT/bare name).
	var paths []string
	if doc.RecordPath != "" {
		paths = append(paths, doc.RecordPath)
	}
	for _, ext := range []string{".pdf", ".txt", ""} {
		paths = append(paths, filepath.Join(storeDir, id+ext))
	}
	for _, path := range paths {
		if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
			log.Printf("Delete %s: cannot remove %s: %v", id, path, err)
		}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{"status": "deleted"})
}
// exportCSVHandler streams documents as a CSV attachment named
// documents.csv: those in the "category" query parameter when it is set,
// otherwise all documents. The first row is a header.
//
// Lookup and write failures are logged. Once streaming has begun the status
// code can no longer change, so a write failure only shows up in the log.
func exportCSVHandler(w http.ResponseWriter, r *http.Request) {
	category := r.URL.Query().Get("category")
	var (
		docs []Document
		err  error
	)
	if category != "" {
		docs, err = GetDocumentsByCategory(category)
	} else {
		docs, err = GetAllDocuments()
	}
	if err != nil {
		log.Printf("CSV export lookup error: %v", err)
	}

	w.Header().Set("Content-Type", "text/csv")
	w.Header().Set("Content-Disposition", "attachment; filename=documents.csv")

	// Individual Write errors are sticky and surface through writer.Error()
	// after the final Flush, so they are checked once at the end.
	writer := csv.NewWriter(w)
	writer.Write([]string{"ID", "Title", "Category", "Type", "Date", "Amount", "Vendor", "Summary"})
	for _, doc := range docs {
		writer.Write([]string{
			doc.ID, doc.Title, doc.Category, doc.Type,
			doc.Date, doc.Amount, doc.Vendor, doc.Summary,
		})
	}
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Printf("CSV export write error: %v", err)
	}
}
// debugStatsHandler dumps the current statistics as JSON for debugging.
//
// The "error" field holds the GetStats error message, or null when there was
// none. An error value itself has no exported fields and would marshal as
// "{}", hiding the message, so its text is sent instead.
func debugStatsHandler(w http.ResponseWriter, r *http.Request) {
	stats, err := GetStats()

	var errMsg interface{}
	if err != nil {
		errMsg = err.Error()
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"error":         errMsg,
		"total":         stats.TotalDocs,
		"recent":        stats.RecentDocs,
		"uploadsCount":  len(stats.RecentUploads),
		"recentUploads": stats.RecentUploads,
	})
}
// createShareHandler creates a share for the document whose ID is in the URL
// and returns its token and public URL as JSON. The optional JSON body may
// set "days"; it stays at 7 when the body is missing or does not set it.
func createShareHandler(w http.ResponseWriter, r *http.Request) {
	docID := chi.URLParam(r, "id")

	// Start from the default; a body that fails to decode leaves it in place.
	body := struct {
		Days int `json:"days"`
	}{Days: 7}
	json.NewDecoder(r.Body).Decode(&body)

	token, shareErr := CreateShare(docID, body.Days)
	if shareErr != nil {
		http.Error(w, "Failed to create share", http.StatusInternalServerError)
		return
	}

	reply := map[string]string{
		"token": token,
		"url":   fmt.Sprintf("/s/%s.pdf", token),
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(reply)
}
func deleteShareHandler(w http.ResponseWriter, r *http.Request) {
token := chi.URLParam(r, "token")
DeleteShare(token)
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]string{"status": "deleted"})
}
// listSharesHandler returns the shares of the document whose ID is in the
// URL as a JSON array. A lookup error or a nil result both yield an empty
// array rather than null.
func listSharesHandler(w http.ResponseWriter, r *http.Request) {
	docID := chi.URLParam(r, "id")

	shares, lookupErr := GetSharesByDocument(docID)
	if lookupErr != nil || shares == nil {
		shares = []Share{}
	}

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(shares)
}
// publicShareHandler serves the file behind a share token. A trailing ".pdf"
// on the token is ignored. It looks for <id>.pdf, then <id>.txt, then a file
// named exactly <id> in storeDir, and answers 404 when the share or the file
// cannot be found.
//
// The response carries an inline Content-Disposition whose filename is
// derived from the document title: spaces become dashes, and double quotes,
// backslashes and control characters are dropped so the title cannot break
// out of the quoted header parameter.
func publicShareHandler(w http.ResponseWriter, r *http.Request) {
	token := chi.URLParam(r, "token")
	token = strings.TrimSuffix(token, ".pdf")
	doc, err := GetShare(token)
	if err != nil || doc == nil {
		http.Error(w, "Not found", http.StatusNotFound)
		return
	}

	safeTitle := strings.Map(func(c rune) rune {
		switch {
		case c == ' ':
			return '-'
		case c == '"' || c == '\\' || c < 0x20 || c == 0x7f:
			return -1 // drop
		}
		return c
	}, doc.Title)
	if safeTitle == "" {
		safeTitle = "document"
	}

	// Serve the PDF directly
	for _, ext := range []string{".pdf", ".txt", ""} {
		path := filepath.Join(storeDir, doc.ID+ext)
		if _, err := os.Stat(path); err != nil {
			continue
		}
		// Files stored without an extension are labelled as PDFs.
		contentType, nameExt := "application/pdf", ".pdf"
		if ext == ".txt" {
			contentType, nameExt = "text/plain", ".txt"
		}
		w.Header().Set("Content-Type", contentType)
		// Set filename so Android/browsers handle it properly
		w.Header().Set("Content-Disposition", fmt.Sprintf(`inline; filename="%s"`, safeTitle+nameExt))
		http.ServeFile(w, r, path)
		return
	}
	http.Error(w, "File not found", http.StatusNotFound)
}
// reindexHandler is a deliberate no-op. The previous implementation wiped
// every document and re-indexed markdown files that are no longer used, so
// the endpoint now only logs the call and reports "reindexed" without
// touching any data.
// TODO: Implement safe reprocessing that doesn't delete existing docs
func reindexHandler(w http.ResponseWriter, r *http.Request) {
	log.Printf("Reindex endpoint called but disabled (would wipe all data)")

	reply := map[string]string{"status": "reindexed"}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(reply)
}