diff --git a/build/package/Dockerfile b/build/package/Dockerfile index 28fd298..e7daab1 100644 --- a/build/package/Dockerfile +++ b/build/package/Dockerfile @@ -11,7 +11,8 @@ RUN CGO_ENABLED=0 GOOS=linux go build -trimpath -o /out/gis ./cmd/gis FROM alpine:3.20 -# gdal-tools provides gdal_translate / gdalinfo for raster COG conversion (worker). +# gdal-tools provides gdal_translate / gdalinfo for raster COG conversion and +# ogr2ogr for extracting vector feature geometry (worker). RUN apk add --no-cache ca-certificates tzdata gdal-tools \ && adduser -D -u 10001 app diff --git a/internal/raster/gdal.go b/internal/raster/gdal.go index 0643140..1e2464d 100644 --- a/internal/raster/gdal.go +++ b/internal/raster/gdal.go @@ -8,6 +8,7 @@ import ( "encoding/json" "fmt" "os/exec" + "path/filepath" "strings" ) @@ -56,3 +57,52 @@ func (c *GDALConverter) Footprint(ctx context.Context, src string) ([]byte, erro } return info.Wgs84Extent, nil } + +// VectorGeometry reads every feature of a vector file and returns their combined +// geometry as a GeoJSON GeometryCollection reprojected to EPSG:4326, or nil if +// the file has no features. The caller (PostGIS) dissolves the collection into +// the union of all features. Zipped ESRI shapefiles are read in place via GDAL's +// /vsizip/ virtual filesystem; GeoJSON and GeoPackage are read directly. +func (c *GDALConverter) VectorGeometry(ctx context.Context, src string) ([]byte, error) { + input := src + if strings.EqualFold(filepath.Ext(src), ".zip") { + input = "/vsizip/" + src + } + + cmd := exec.CommandContext(ctx, "ogr2ogr", + "-f", "GeoJSON", + "-t_srs", "EPSG:4326", + "/vsistdout/", input, + ) + var stderr strings.Builder + cmd.Stderr = &stderr + out, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("ogr2ogr: %w: %s", err, strings.TrimSpace(stderr.String())) + } + + var fc struct { + Features []struct { + Geometry json.RawMessage `json:"geometry"` + } `json:"features"` + } + if err := json.Unmarshal(out, &fc); err != nil { + return nil, fmt.Errorf("parse ogr2ogr output: %w", err) + } + + geoms := make([]json.RawMessage, 0, len(fc.Features)) + for _, f := range fc.Features { + if len(f.Geometry) == 0 || string(f.Geometry) == "null" { + continue + } + geoms = append(geoms, f.Geometry) + } + if len(geoms) == 0 { + return nil, nil + } + + return json.Marshal(struct { + Type string `json:"type"` + Geometries []json.RawMessage `json:"geometries"` + }{Type: "GeometryCollection", Geometries: geoms}) +} diff --git a/internal/repository/postgres/dataset.go b/internal/repository/postgres/dataset.go index dcaed2f..23e64b9 100644 --- a/internal/repository/postgres/dataset.go +++ b/internal/repository/postgres/dataset.go @@ -139,12 +139,23 @@ func (r *DatasetRepository) MarkConverted(ctx context.Context, id uuid.UUID, cog return nil } -// SetProperties stores the extracted attribute table (nil -> NULL) and marks the -// dataset ready. -func (r *DatasetRepository) SetProperties(ctx context.Context, id uuid.UUID, properties []byte) error { +// SetProperties stores the extracted attribute table (nil -> NULL) and the +// dissolved feature geometry (GeoJSON in EPSG:4326; nil keeps the existing +// geometry), then marks the dataset ready. The geometry is reduced to the union +// of all features via ST_UnaryUnion. +func (r *DatasetRepository) SetProperties(ctx context.Context, id uuid.UUID, properties, geometry []byte) error { + var geom any // nil -> SQL NULL -> CASE keeps existing geometry + if len(geometry) > 0 { + geom = string(geometry) + } tag, err := r.pool.Exec(ctx, - `UPDATE datasets SET properties = $2, status = $3, parse_error = NULL, updated_at = now() WHERE id = $1`, - id, nullableJSON(json.RawMessage(properties)), domain.DatasetStatusReady, + `UPDATE datasets + SET properties = $2, + geometry = CASE WHEN $3::text IS NULL THEN geometry + ELSE ST_UnaryUnion(ST_SetSRID(ST_GeomFromGeoJSON($3), 4326)) END, + status = $4, parse_error = NULL, updated_at = now() + WHERE id = $1`, + id, nullableJSON(json.RawMessage(properties)), geom, domain.DatasetStatusReady, ) if err != nil { return mapError(err) diff --git a/internal/service/dataset.go b/internal/service/dataset.go index e0d38f2..dc65273 100644 --- a/internal/service/dataset.go +++ b/internal/service/dataset.go @@ -33,7 +33,7 @@ type DatasetRepository interface { MarkParseFailed(ctx context.Context, id uuid.UUID, reason string) error MarkReady(ctx context.Context, id uuid.UUID) error MarkConverted(ctx context.Context, id uuid.UUID, cogKey string, footprint []byte) error - SetProperties(ctx context.Context, id uuid.UUID, properties []byte) error + SetProperties(ctx context.Context, id uuid.UUID, properties, geometry []byte) error SaveMapping(ctx context.Context, id uuid.UUID, katoColumn string, years []domain.YearColumn) (domain.Dataset, error) ReplaceObservations(ctx context.Context, datasetID uuid.UUID, obs []domain.Observation) error ListObservations(ctx context.Context, datasetID uuid.UUID, katoCode *string, limit, offset int) ([]domain.Observation, error) @@ -82,11 +82,15 @@ type ColumnParser func(filename string, data []byte) ([]domain.AttributeColumn, // RowParser reads every attribute row from a file's raw bytes as name->value maps. type RowParser func(filename string, data []byte) ([]map[string]string, error) -// RasterConverter converts a raster file to a Cloud-Optimized GeoTIFF and reads -// its footprint. It operates on local file paths. +// RasterConverter converts a raster file to a Cloud-Optimized GeoTIFF, reads its +// footprint, and dissolves a vector file's features into a single geometry. It +// operates on local file paths. type RasterConverter interface { ToCOG(ctx context.Context, srcPath, dstPath string) error Footprint(ctx context.Context, srcPath string) ([]byte, error) + // VectorGeometry returns the combined geometry of a vector file's features as + // GeoJSON in EPSG:4326 (nil if the file has no features). + VectorGeometry(ctx context.Context, srcPath string) ([]byte, error) } // UploadInput carries everything needed to store a new dataset. @@ -249,10 +253,12 @@ func (s *DatasetService) Reprocess(ctx context.Context, id uuid.UUID) (domain.Da return dataset, nil } -// ExtractProperties reads a plain vector dataset's attribute table and stores it -// (as a JSON array of row objects) in the properties column, then marks the -// dataset ready. Invoked by the worker. Parse failures are recorded; storage -// failures are returned for retry. +// ExtractProperties reads a plain vector dataset's attribute table and spatial +// geometry and stores them (the attribute table as a JSON array of row objects +// in the properties column, the dissolved feature geometry in the geometry +// column), then marks the dataset ready. Invoked by the worker. Parse failures +// are recorded; storage failures are returned for retry. Geometry extraction is +// best-effort: a failure leaves geometry unset rather than failing the job. func (s *DatasetService) ExtractProperties(ctx context.Context, id uuid.UUID) error { dataset, err := s.repo.GetByID(ctx, id) if err != nil { @@ -278,7 +284,31 @@ func (s *DatasetService) ExtractProperties(ctx context.Context, id uuid.UUID) er return err } } - return s.repo.SetProperties(ctx, id, properties) + + geometry := s.vectorGeometry(ctx, dataset.Filename, data) + return s.repo.SetProperties(ctx, id, properties, geometry) +} + +// vectorGeometry dissolves a vector file's features into a single GeoJSON +// geometry via the converter. It writes the in-memory bytes to a temp file +// (preserving the extension so the converter detects the format) because the +// converter operates on file paths. Best-effort: any failure yields nil. +func (s *DatasetService) vectorGeometry(ctx context.Context, filename string, data []byte) []byte { + ext := strings.ToLower(filepath.Ext(filename)) + f, err := os.CreateTemp("", "gis-vec-*"+ext) + if err != nil { + return nil + } + defer os.Remove(f.Name()) + if _, err := f.Write(data); err != nil { + f.Close() + return nil + } + if err := f.Close(); err != nil { + return nil + } + geom, _ := s.converter.VectorGeometry(ctx, f.Name()) + return geom } // hasAttributeData reports whether any row carries at least one attribute. diff --git a/internal/service/dataset_test.go b/internal/service/dataset_test.go index 452a722..547d601 100644 --- a/internal/service/dataset_test.go +++ b/internal/service/dataset_test.go @@ -125,12 +125,15 @@ func (r *stubDatasetRepo) MarkConverted(_ context.Context, id uuid.UUID, cogKey return nil } -func (r *stubDatasetRepo) SetProperties(_ context.Context, id uuid.UUID, properties []byte) error { +func (r *stubDatasetRepo) SetProperties(_ context.Context, id uuid.UUID, properties, geometry []byte) error { d, ok := r.store[id] if !ok { return domain.ErrNotFound } d.Properties = properties + if len(geometry) > 0 { + d.Geometry = geometry + } d.Status = domain.DatasetStatusReady r.store[id] = d return nil @@ -192,10 +195,12 @@ func (s *stubEnqueuer) EnqueueConvert(_ context.Context, id uuid.UUID) error { // stubConverter records raster conversions. type stubConverter struct { - cogCalls int - toCOGErr error - footprint []byte - footprintFn func(src string) ([]byte, error) + cogCalls int + toCOGErr error + footprint []byte + footprintFn func(src string) ([]byte, error) + vectorGeom []byte + vectorGeomFn func(src string) ([]byte, error) } func (c *stubConverter) ToCOG(_ context.Context, _, dst string) error { @@ -213,6 +218,13 @@ func (c *stubConverter) Footprint(_ context.Context, src string) ([]byte, error) return c.footprint, nil } +func (c *stubConverter) VectorGeometry(_ context.Context, src string) ([]byte, error) { + if c.vectorGeomFn != nil { + return c.vectorGeomFn(src) + } + return c.vectorGeom, nil +} + var ( noopParser ColumnParser = func(string, []byte) ([]domain.AttributeColumn, error) { return nil, nil } noopRowParser RowParser = func(string, []byte) ([]map[string]string, error) { return nil, nil } @@ -428,7 +440,9 @@ func TestDatasetService_ExtractProperties(t *testing.T) { {"name": "Almaty", "pop": "2000"}, } rp := RowParser(func(string, []byte) ([]map[string]string, error) { return rows, nil }) - svc := NewDatasetService(repo, &stubStore{}, stubCategoryReader{exists: true}, &stubEnqueuer{}, noopParser, rp, &stubConverter{}) + geom := []byte(`{"type":"GeometryCollection","geometries":[]}`) + conv := &stubConverter{vectorGeom: geom} + svc := NewDatasetService(repo, &stubStore{}, stubCategoryReader{exists: true}, &stubEnqueuer{}, noopParser, rp, conv) if err := svc.ExtractProperties(context.Background(), id); err != nil { t.Fatalf("unexpected error: %v", err) @@ -444,6 +458,40 @@ func TestDatasetService_ExtractProperties(t *testing.T) { if len(parsed) != 2 { t.Fatalf("want 2 rows in properties, got %d", len(parsed)) } + if string(got.Geometry) != string(geom) { + t.Fatalf("want geometry %s, got %s", geom, got.Geometry) + } +} + +func TestDatasetService_ExtractProperties_GeometryBestEffort(t *testing.T) { + id := uuid.New() + repo := newStubDatasetRepo() + repo.store[id] = domain.Dataset{ + ID: id, FileType: domain.FileTypeVector, + Filename: "d.geojson", StorageKey: "k", Status: domain.DatasetStatusProcessing, + } + rp := RowParser(func(string, []byte) ([]map[string]string, error) { + return []map[string]string{{"name": "Astana"}}, nil + }) + // Geometry extraction fails; the job must still succeed with properties set. + conv := &stubConverter{vectorGeomFn: func(string) ([]byte, error) { + return nil, errors.New("ogr2ogr boom") + }} + svc := NewDatasetService(repo, &stubStore{}, stubCategoryReader{exists: true}, &stubEnqueuer{}, noopParser, rp, conv) + + if err := svc.ExtractProperties(context.Background(), id); err != nil { + t.Fatalf("unexpected error: %v", err) + } + got := repo.store[id] + if got.Status != domain.DatasetStatusReady { + t.Fatalf("want ready, got %q", got.Status) + } + if got.Geometry != nil { + t.Fatalf("expected nil geometry on extraction failure, got %s", got.Geometry) + } + if got.Properties == nil { + t.Fatalf("expected properties to be set despite geometry failure") + } } func TestDatasetService_ExtractProperties_NoTable(t *testing.T) {