gis/internal/domain/dataset.go

226 lines
7.9 KiB
Go

package domain
import (
"bytes"
"encoding/json"
"fmt"
"time"
"github.com/google/uuid"
)
// FileType classifies the kind of geo file a dataset holds.
//
// It is a named string type, so values serialize as plain strings (for example
// in the "file_type" JSON field of Dataset and DatasetSummary). The known
// values are the FileType* constants; use Valid to check an arbitrary value.
type FileType string
// Known FileType values. Each of them has an entry in allowedExtensions, which
// is the table FileType.Valid consults.
const (
// FileTypeVectorWithKato is a vector file whose attribute table holds KATO
// codes; its columns are detected and mapped before observations are
// extracted (see the DatasetStatus* constants).
FileTypeVectorWithKato FileType = "vector_with_kato"
// FileTypeVector is a vector file without the KATO column mapping step.
FileTypeVector FileType = "vector"
// FileTypeRaster is a raster file (.tif/.tiff per allowedExtensions).
FileTypeRaster FileType = "raster"
)
// Valid reports whether ft is a recognized file type, i.e. whether the
// allowedExtensions table has an entry for it.
func (ft FileType) Valid() bool {
	_, known := allowedExtensions[ft]
	return known
}
// Dataset lifecycle statuses.
//
// These are untyped string constants; Dataset.Status and DatasetSummary.Status
// are plain strings. Every value listed here must also appear in
// datasetStatuses, otherwise ValidDatasetStatus rejects it.
const (
// DatasetStatusPending is the initial state before any processing.
DatasetStatusPending = "pending"
// DatasetStatusParsing means a vector_with_kato file's attribute table is
// being parsed asynchronously.
DatasetStatusParsing = "parsing"
// DatasetStatusProcessing means a raster is being converted to a
// Cloud-Optimized GeoTIFF.
DatasetStatusProcessing = "processing"
// DatasetStatusAwaitingMapping means columns were detected and the user must
// choose the KATO column and map year columns.
DatasetStatusAwaitingMapping = "awaiting_mapping"
// DatasetStatusExtracting means the mapping was saved and the attribute table
// is being unpivoted into observations.
DatasetStatusExtracting = "extracting"
// DatasetStatusReady means the dataset is fully configured and extracted.
DatasetStatusReady = "ready"
// DatasetStatusFailed means parsing or extraction failed; the reason is
// recorded in Dataset.ParseError.
DatasetStatusFailed = "failed"
)
// datasetStatuses is the set of valid dataset lifecycle statuses, keyed by the
// status string. The empty-struct values carry no data; only key membership
// matters. Keep it in sync with the DatasetStatus* constants: a status missing
// here is reported as invalid by ValidDatasetStatus.
var datasetStatuses = map[string]struct{}{
DatasetStatusPending: {},
DatasetStatusParsing: {},
DatasetStatusProcessing: {},
DatasetStatusAwaitingMapping: {},
DatasetStatusExtracting: {},
DatasetStatusReady: {},
DatasetStatusFailed: {},
}
// ValidDatasetStatus reports whether s is one of the known dataset lifecycle
// statuses, i.e. a key of datasetStatuses. The comparison is exact
// (case-sensitive, no trimming).
func ValidDatasetStatus(s string) bool {
	_, known := datasetStatuses[s]
	return known
}
// DatasetFilter holds optional filters for listing dataset summaries. A nil
// field places no constraint on that attribute; listings are always ordered by
// created_at descending regardless of the filter.
//
// The zero value therefore selects everything. The code that applies the
// filter is not part of this type.
type DatasetFilter struct {
// CategoryID, when non-nil, restricts the listing to that category.
CategoryID *uuid.UUID
// FileType, when non-nil, restricts the listing to that file type.
FileType *FileType
// Automated, when non-nil, matches datasets by their Automated flag.
Automated *bool
// Status, when non-nil, restricts the listing to that lifecycle status.
// NOTE(review): presumably one of the DatasetStatus* values; this type does
// not validate it, so callers should check with ValidDatasetStatus.
Status *string
}
// Observation is a single unpivoted value from a dataset's attribute table,
// keyed by KATO code and date. Typically exactly one of Value / ValueText is
// set (numeric vs non-numeric cell); both may be nil for an empty cell.
//
// Neither pointer field uses omitempty, so a nil Value or ValueText is emitted
// as JSON null rather than dropped.
type Observation struct {
ID uuid.UUID `json:"id"`
// DatasetID identifies the dataset the value was extracted from.
DatasetID uuid.UUID `json:"dataset_id"`
// KatoCode is the KATO code the value belongs to.
KatoCode string `json:"kato_code"`
// Date is kept as a string.
// NOTE(review): presumably the date of the mapped YearColumn
// (e.g. "2023-01-01"); the format is not enforced here, so confirm.
Date string `json:"date"`
// Value holds the cell when it is numeric.
Value *float64 `json:"value"`
// ValueText holds the cell when it is not numeric.
ValueText *string `json:"value_text"`
}
// allowedExtensions lists the accepted lowercase file extensions (including the
// dot) for each file type.
//
// The map also serves as the registry of known file types: FileType.Valid is a
// key lookup here, so a new FileType constant needs an entry to be considered
// valid. Both vector types accept the same three extensions, listed in a
// different order.
// NOTE(review): ".zip" is presumably a zipped shapefile; confirm with the
// worker that parses it.
var allowedExtensions = map[FileType][]string{
FileTypeVectorWithKato: {".zip", ".geojson", ".gpkg"},
FileTypeVector: {".geojson", ".gpkg", ".zip"},
FileTypeRaster: {".tif", ".tiff"},
}
// AllowedExtensions returns the accepted lowercase extensions (including the
// dot) for a file type, or nil when ft is not a known type.
//
// The result is a fresh copy. Callers may sort it, assign into it or append to
// it without altering the package-level table that FileType.Valid and
// ExtensionAllowedFor depend on.
func AllowedExtensions(ft FileType) []string {
	// Appending onto a nil slice clones the entry without an extra import and
	// still yields nil for an unknown file type.
	return append([]string(nil), allowedExtensions[ft]...)
}
// ExtensionAllowedFor reports whether ext is one of the extensions accepted
// for ft. ext must already be lowercase and include the leading dot; the match
// is exact. An unknown ft has no accepted extensions, so the result is false.
func ExtensionAllowedFor(ft FileType, ext string) bool {
	accepted := allowedExtensions[ft]
	for i := range accepted {
		if accepted[i] == ext {
			return true
		}
	}
	return false
}
// ValidateFileContent performs a lightweight magic-byte/shape check that an
// uploaded file's content matches its extension, catching mislabeled uploads at
// request time. It only sniffs the start of the file; the worker performs the
// full parse/convert later.
//
// ext is the lowercase extension including the dot (the same form
// ExtensionAllowedFor expects). head is the first bytes of the file; the
// longest signature compared is 16 bytes. Extensions with no case below are
// not checked and yield nil. A non-nil error means head does not start the way
// the format implied by ext requires.
func ValidateFileContent(ext string, head []byte) error {
	// startsWithAny reports whether head begins with one of the signatures.
	startsWithAny := func(signatures ...string) bool {
		for _, sig := range signatures {
			if bytes.HasPrefix(head, []byte(sig)) {
				return true
			}
		}
		return false
	}

	switch ext {
	case ".tif", ".tiff":
		// Classic TIFF (version 42, '*') and BigTIFF (version 43, '+'), each in
		// little-endian ("II") and big-endian ("MM") byte order. BigTIFF is the
		// variant used for rasters beyond the 4 GiB classic limit, so it must
		// not be rejected as a mislabeled upload.
		if !startsWithAny("II*\x00", "MM\x00*", "II+\x00", "MM\x00+") {
			return fmt.Errorf("file is not a valid TIFF/GeoTIFF")
		}
	case ".zip":
		// ZIP local-file or empty-archive signature.
		if !startsWithAny("PK\x03\x04", "PK\x05\x06") {
			return fmt.Errorf("file is not a valid ZIP archive")
		}
	case ".gpkg":
		// GeoPackage is an SQLite 3 database; the header string includes the
		// terminating NUL.
		if !startsWithAny("SQLite format 3\x00") {
			return fmt.Errorf("file is not a valid GeoPackage (SQLite) file")
		}
	case ".geojson", ".json":
		// A GeoJSON FeatureCollection/Feature is a JSON object, so the first
		// significant byte must open one.
		if b, ok := firstMeaningfulByte(head); !ok || b != '{' {
			return fmt.Errorf("file is not valid GeoJSON")
		}
	}
	return nil
}

// firstMeaningfulByte returns the first byte of head that follows an optional
// UTF-8 BOM and any run of JSON whitespace (space, tab, CR, LF). The boolean is
// false when head holds nothing but a BOM and/or whitespace.
func firstMeaningfulByte(head []byte) (byte, bool) {
	rest := bytes.TrimPrefix(head, []byte{0xEF, 0xBB, 0xBF})
	// The cutset is pure ASCII, so trimming is byte-wise and exactly matches
	// the four JSON whitespace characters.
	rest = bytes.TrimLeft(rest, " \t\r\n")
	if len(rest) == 0 {
		return 0, false
	}
	return rest[0], true
}
// AttributeColumn is a detected column from a vector file's attribute table,
// with a few sample values to help the user identify it (e.g. the KATO column).
type AttributeColumn struct {
// Name is the column name as detected in the attribute table.
Name string `json:"name"`
// Samples holds example values from the column, as strings. The field is
// omitted from JSON when empty.
Samples []string `json:"samples,omitempty"`
}
// YearColumn maps an attribute column to the date it represents,
// e.g. {"column": "F_2023", "date": "2023-01-01"}.
type YearColumn struct {
// Column is the attribute column name being mapped.
Column string `json:"column"`
// Date is the date the column stands for, kept as a string.
// NOTE(review): the example suggests YYYY-MM-DD; the format is not
// enforced by this type, so confirm where it is validated.
Date string `json:"date"`
}
// DatasetSummary is the lightweight view of a dataset used in list responses.
// It omits the heavy geometry/attribute/JSONB fields.
//
// Every field here has a same-named, same-tagged counterpart on Dataset.
type DatasetSummary struct {
ID uuid.UUID `json:"id"`
// CategoryID is the category the dataset is grouped under.
CategoryID uuid.UUID `json:"category_id"`
Name string `json:"name"`
// Description is optional; nil is emitted as JSON null.
Description *string `json:"description"`
// Unit is optional; nil is emitted as JSON null.
Unit *string `json:"unit"`
FileType FileType `json:"file_type"`
SizeBytes int64 `json:"size_bytes"`
// Status is the dataset's lifecycle status (see DatasetStatus* constants).
Status string `json:"status"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// Dataset is a geo file stored in the object store and grouped under a category.
//
// Apart from BBox, no field uses omitempty: nil pointers, nil slices and nil
// json.RawMessage values are all emitted as JSON null.
type Dataset struct {
ID uuid.UUID `json:"id"`
// CategoryID is the category the dataset is grouped under.
CategoryID uuid.UUID `json:"category_id"`
Name string `json:"name"`
// Description is optional. Nullable.
Description *string `json:"description"`
// Unit is optional. Nullable.
Unit *string `json:"unit"`
// NOTE(review): Filename is presumably the name of the uploaded file; confirm.
Filename string `json:"filename"`
// StorageKey is the key of the file in the object store.
StorageKey string `json:"storage_key"`
// CogStorageKey points to the Cloud-Optimized GeoTIFF for rasters. Nullable.
CogStorageKey *string `json:"cog_storage_key"`
FileType FileType `json:"file_type"`
SizeBytes int64 `json:"size_bytes"`
// NOTE(review): ContentType is presumably the MIME type of the upload; confirm.
ContentType string `json:"content_type"`
// Properties holds tabular data extracted from the file (e.g. a shapefile's
// attribute table). Nullable.
Properties json.RawMessage `json:"properties"`
// Meta holds arbitrary user-defined data. Nullable.
Meta json.RawMessage `json:"meta"`
// Automated is a user-defined flag.
Automated bool `json:"automated"`
// Status is the dataset's lifecycle status (see DatasetStatus* constants).
Status string `json:"status"`
// AttributeColumns are the columns detected from the file's attribute table
// (vector_with_kato only). Nullable until parsed.
AttributeColumns []AttributeColumn `json:"attribute_columns"`
// KatoColumn is the user-selected column holding KATO codes. Nullable.
KatoColumn *string `json:"kato_column"`
// YearColumns maps attribute columns to dates. Nullable until mapped.
YearColumns []YearColumn `json:"year_columns"`
// ParseError holds the failure reason when Status is failed. Nullable.
ParseError *string `json:"parse_error"`
// Geometry is the dataset's spatial geometry, serialized as GeoJSON.
// Nullable; populated from the file's spatial data.
Geometry json.RawMessage `json:"geometry"`
// BBox is the axis-aligned bounding box [minX, minY, maxX, maxY] derived
// from the geometry. Included in responses only for raster datasets; the
// omitempty tag drops it from JSON whenever the slice is empty.
BBox []float64 `json:"bbox,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}