fix: Fix geometry parsing

This commit is contained in:
Bakhtiyar Issakhmetov 2026-06-28 00:37:29 +05:00
parent 7c469a524b
commit 1e71b94fdb
5 changed files with 160 additions and 20 deletions

View File

@ -11,7 +11,8 @@ RUN CGO_ENABLED=0 GOOS=linux go build -trimpath -o /out/gis ./cmd/gis
FROM alpine:3.20 FROM alpine:3.20
# gdal-tools provides gdal_translate / gdalinfo for raster COG conversion (worker). # gdal-tools provides gdal_translate / gdalinfo for raster COG conversion and
# ogr2ogr for extracting vector feature geometry (worker).
RUN apk add --no-cache ca-certificates tzdata gdal-tools \ RUN apk add --no-cache ca-certificates tzdata gdal-tools \
&& adduser -D -u 10001 app && adduser -D -u 10001 app

View File

@ -8,6 +8,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"os/exec" "os/exec"
"path/filepath"
"strings" "strings"
) )
@ -56,3 +57,52 @@ func (c *GDALConverter) Footprint(ctx context.Context, src string) ([]byte, erro
} }
return info.Wgs84Extent, nil return info.Wgs84Extent, nil
} }
// VectorGeometry runs ogr2ogr over the vector file at src, reprojecting it to
// EPSG:4326, and packs every non-null feature geometry into a single GeoJSON
// GeometryCollection.
//
// A src whose extension is ".zip" (compared case-insensitively) is opened
// through GDAL's /vsizip/ virtual filesystem; any other path is handed to
// ogr2ogr unchanged.
//
// It returns (nil, nil) when no feature carries a geometry. A failing ogr2ogr
// run is reported with the command's trimmed stderr appended; output that is
// not valid JSON is reported as a parse error.
func (c *GDALConverter) VectorGeometry(ctx context.Context, src string) ([]byte, error) {
	source := src
	if strings.EqualFold(filepath.Ext(src), ".zip") {
		source = "/vsizip/" + src
	}

	// The destination is /vsistdout/, so the converted GeoJSON is collected
	// from the command's standard output rather than from a scratch file.
	args := []string{"-f", "GeoJSON", "-t_srs", "EPSG:4326", "/vsistdout/", source}
	run := exec.CommandContext(ctx, "ogr2ogr", args...)
	var errOut strings.Builder
	run.Stderr = &errOut
	raw, err := run.Output()
	if err != nil {
		return nil, fmt.Errorf("ogr2ogr: %w: %s", err, strings.TrimSpace(errOut.String()))
	}

	// Only the geometry member of each feature matters here; keep it as raw
	// JSON so it can be copied into the result without re-encoding.
	type feature struct {
		Geometry json.RawMessage `json:"geometry"`
	}
	var collection struct {
		Features []feature `json:"features"`
	}
	if err := json.Unmarshal(raw, &collection); err != nil {
		return nil, fmt.Errorf("parse ogr2ogr output: %w", err)
	}

	parts := make([]json.RawMessage, 0, len(collection.Features))
	for i := range collection.Features {
		g := collection.Features[i].Geometry
		// Skip features with a missing or explicit null geometry.
		if len(g) > 0 && string(g) != "null" {
			parts = append(parts, g)
		}
	}
	if len(parts) == 0 {
		return nil, nil
	}

	type geometryCollection struct {
		Type       string            `json:"type"`
		Geometries []json.RawMessage `json:"geometries"`
	}
	return json.Marshal(geometryCollection{Type: "GeometryCollection", Geometries: parts})
}

View File

@ -139,12 +139,23 @@ func (r *DatasetRepository) MarkConverted(ctx context.Context, id uuid.UUID, cog
return nil return nil
} }
// SetProperties stores the extracted attribute table (nil -> NULL) and marks the // SetProperties stores the extracted attribute table (nil -> NULL) and the
// dataset ready. // dissolved feature geometry (GeoJSON in EPSG:4326; nil keeps the existing
func (r *DatasetRepository) SetProperties(ctx context.Context, id uuid.UUID, properties []byte) error { // geometry), then marks the dataset ready. The geometry is reduced to the union
// of all features via ST_UnaryUnion.
func (r *DatasetRepository) SetProperties(ctx context.Context, id uuid.UUID, properties, geometry []byte) error {
var geom any // nil -> SQL NULL -> CASE keeps existing geometry
if len(geometry) > 0 {
geom = string(geometry)
}
tag, err := r.pool.Exec(ctx, tag, err := r.pool.Exec(ctx,
`UPDATE datasets SET properties = $2, status = $3, parse_error = NULL, updated_at = now() WHERE id = $1`, `UPDATE datasets
id, nullableJSON(json.RawMessage(properties)), domain.DatasetStatusReady, SET properties = $2,
geometry = CASE WHEN $3::text IS NULL THEN geometry
ELSE ST_UnaryUnion(ST_SetSRID(ST_GeomFromGeoJSON($3), 4326)) END,
status = $4, parse_error = NULL, updated_at = now()
WHERE id = $1`,
id, nullableJSON(json.RawMessage(properties)), geom, domain.DatasetStatusReady,
) )
if err != nil { if err != nil {
return mapError(err) return mapError(err)

View File

@ -33,7 +33,7 @@ type DatasetRepository interface {
MarkParseFailed(ctx context.Context, id uuid.UUID, reason string) error MarkParseFailed(ctx context.Context, id uuid.UUID, reason string) error
MarkReady(ctx context.Context, id uuid.UUID) error MarkReady(ctx context.Context, id uuid.UUID) error
MarkConverted(ctx context.Context, id uuid.UUID, cogKey string, footprint []byte) error MarkConverted(ctx context.Context, id uuid.UUID, cogKey string, footprint []byte) error
SetProperties(ctx context.Context, id uuid.UUID, properties []byte) error SetProperties(ctx context.Context, id uuid.UUID, properties, geometry []byte) error
SaveMapping(ctx context.Context, id uuid.UUID, katoColumn string, years []domain.YearColumn) (domain.Dataset, error) SaveMapping(ctx context.Context, id uuid.UUID, katoColumn string, years []domain.YearColumn) (domain.Dataset, error)
ReplaceObservations(ctx context.Context, datasetID uuid.UUID, obs []domain.Observation) error ReplaceObservations(ctx context.Context, datasetID uuid.UUID, obs []domain.Observation) error
ListObservations(ctx context.Context, datasetID uuid.UUID, katoCode *string, limit, offset int) ([]domain.Observation, error) ListObservations(ctx context.Context, datasetID uuid.UUID, katoCode *string, limit, offset int) ([]domain.Observation, error)
@ -82,11 +82,15 @@ type ColumnParser func(filename string, data []byte) ([]domain.AttributeColumn,
// RowParser reads every attribute row from a file's raw bytes as name->value maps. // RowParser reads every attribute row from a file's raw bytes as name->value maps.
type RowParser func(filename string, data []byte) ([]map[string]string, error) type RowParser func(filename string, data []byte) ([]map[string]string, error)
// RasterConverter converts a raster file to a Cloud-Optimized GeoTIFF and reads // RasterConverter converts a raster file to a Cloud-Optimized GeoTIFF, reads its
// its footprint. It operates on local file paths. // footprint, and dissolves a vector file's features into a single geometry. It
// operates on local file paths.
type RasterConverter interface { type RasterConverter interface {
ToCOG(ctx context.Context, srcPath, dstPath string) error ToCOG(ctx context.Context, srcPath, dstPath string) error
Footprint(ctx context.Context, srcPath string) ([]byte, error) Footprint(ctx context.Context, srcPath string) ([]byte, error)
// VectorGeometry returns the combined geometry of a vector file's features as
// GeoJSON in EPSG:4326 (nil if the file has no features).
VectorGeometry(ctx context.Context, srcPath string) ([]byte, error)
} }
// UploadInput carries everything needed to store a new dataset. // UploadInput carries everything needed to store a new dataset.
@ -249,10 +253,12 @@ func (s *DatasetService) Reprocess(ctx context.Context, id uuid.UUID) (domain.Da
return dataset, nil return dataset, nil
} }
// ExtractProperties reads a plain vector dataset's attribute table and stores it // ExtractProperties reads a plain vector dataset's attribute table and spatial
// (as a JSON array of row objects) in the properties column, then marks the // geometry and stores them (the attribute table as a JSON array of row objects
// dataset ready. Invoked by the worker. Parse failures are recorded; storage // in the properties column, the dissolved feature geometry in the geometry
// failures are returned for retry. // column), then marks the dataset ready. Invoked by the worker. Parse failures
// are recorded; storage failures are returned for retry. Geometry extraction is
// best-effort: a failure leaves geometry unset rather than failing the job.
func (s *DatasetService) ExtractProperties(ctx context.Context, id uuid.UUID) error { func (s *DatasetService) ExtractProperties(ctx context.Context, id uuid.UUID) error {
dataset, err := s.repo.GetByID(ctx, id) dataset, err := s.repo.GetByID(ctx, id)
if err != nil { if err != nil {
@ -278,7 +284,31 @@ func (s *DatasetService) ExtractProperties(ctx context.Context, id uuid.UUID) er
return err return err
} }
} }
return s.repo.SetProperties(ctx, id, properties)
geometry := s.vectorGeometry(ctx, dataset.Filename, data)
return s.repo.SetProperties(ctx, id, properties, geometry)
}
// vectorGeometry dissolves a vector file's features into a single GeoJSON
// geometry via the converter. It writes the in-memory bytes to a temp file
// (preserving the extension so the converter detects the format) because the
// converter operates on file paths. Best-effort: any failure yields nil.
func (s *DatasetService) vectorGeometry(ctx context.Context, filename string, data []byte) []byte {
ext := strings.ToLower(filepath.Ext(filename))
f, err := os.CreateTemp("", "gis-vec-*"+ext)
if err != nil {
return nil
}
defer os.Remove(f.Name())
if _, err := f.Write(data); err != nil {
f.Close()
return nil
}
if err := f.Close(); err != nil {
return nil
}
geom, _ := s.converter.VectorGeometry(ctx, f.Name())
return geom
} }
// hasAttributeData reports whether any row carries at least one attribute. // hasAttributeData reports whether any row carries at least one attribute.

View File

@ -125,12 +125,15 @@ func (r *stubDatasetRepo) MarkConverted(_ context.Context, id uuid.UUID, cogKey
return nil return nil
} }
func (r *stubDatasetRepo) SetProperties(_ context.Context, id uuid.UUID, properties []byte) error { func (r *stubDatasetRepo) SetProperties(_ context.Context, id uuid.UUID, properties, geometry []byte) error {
d, ok := r.store[id] d, ok := r.store[id]
if !ok { if !ok {
return domain.ErrNotFound return domain.ErrNotFound
} }
d.Properties = properties d.Properties = properties
if len(geometry) > 0 {
d.Geometry = geometry
}
d.Status = domain.DatasetStatusReady d.Status = domain.DatasetStatusReady
r.store[id] = d r.store[id] = d
return nil return nil
@ -196,6 +199,8 @@ type stubConverter struct {
toCOGErr error toCOGErr error
footprint []byte footprint []byte
footprintFn func(src string) ([]byte, error) footprintFn func(src string) ([]byte, error)
vectorGeom []byte
vectorGeomFn func(src string) ([]byte, error)
} }
func (c *stubConverter) ToCOG(_ context.Context, _, dst string) error { func (c *stubConverter) ToCOG(_ context.Context, _, dst string) error {
@ -213,6 +218,13 @@ func (c *stubConverter) Footprint(_ context.Context, src string) ([]byte, error)
return c.footprint, nil return c.footprint, nil
} }
// VectorGeometry is the test double's geometry extraction. When no
// vectorGeomFn hook is configured it returns the canned vectorGeom bytes with
// a nil error; otherwise the hook decides the result for the given src.
func (c *stubConverter) VectorGeometry(_ context.Context, src string) ([]byte, error) {
	if c.vectorGeomFn == nil {
		return c.vectorGeom, nil
	}
	return c.vectorGeomFn(src)
}
var ( var (
noopParser ColumnParser = func(string, []byte) ([]domain.AttributeColumn, error) { return nil, nil } noopParser ColumnParser = func(string, []byte) ([]domain.AttributeColumn, error) { return nil, nil }
noopRowParser RowParser = func(string, []byte) ([]map[string]string, error) { return nil, nil } noopRowParser RowParser = func(string, []byte) ([]map[string]string, error) { return nil, nil }
@ -428,7 +440,9 @@ func TestDatasetService_ExtractProperties(t *testing.T) {
{"name": "Almaty", "pop": "2000"}, {"name": "Almaty", "pop": "2000"},
} }
rp := RowParser(func(string, []byte) ([]map[string]string, error) { return rows, nil }) rp := RowParser(func(string, []byte) ([]map[string]string, error) { return rows, nil })
svc := NewDatasetService(repo, &stubStore{}, stubCategoryReader{exists: true}, &stubEnqueuer{}, noopParser, rp, &stubConverter{}) geom := []byte(`{"type":"GeometryCollection","geometries":[]}`)
conv := &stubConverter{vectorGeom: geom}
svc := NewDatasetService(repo, &stubStore{}, stubCategoryReader{exists: true}, &stubEnqueuer{}, noopParser, rp, conv)
if err := svc.ExtractProperties(context.Background(), id); err != nil { if err := svc.ExtractProperties(context.Background(), id); err != nil {
t.Fatalf("unexpected error: %v", err) t.Fatalf("unexpected error: %v", err)
@ -444,6 +458,40 @@ func TestDatasetService_ExtractProperties(t *testing.T) {
if len(parsed) != 2 { if len(parsed) != 2 {
t.Fatalf("want 2 rows in properties, got %d", len(parsed)) t.Fatalf("want 2 rows in properties, got %d", len(parsed))
} }
if string(got.Geometry) != string(geom) {
t.Fatalf("want geometry %s, got %s", geom, got.Geometry)
}
}
func TestDatasetService_ExtractProperties_GeometryBestEffort(t *testing.T) {
id := uuid.New()
repo := newStubDatasetRepo()
repo.store[id] = domain.Dataset{
ID: id, FileType: domain.FileTypeVector,
Filename: "d.geojson", StorageKey: "k", Status: domain.DatasetStatusProcessing,
}
rp := RowParser(func(string, []byte) ([]map[string]string, error) {
return []map[string]string{{"name": "Astana"}}, nil
})
// Geometry extraction fails; the job must still succeed with properties set.
conv := &stubConverter{vectorGeomFn: func(string) ([]byte, error) {
return nil, errors.New("ogr2ogr boom")
}}
svc := NewDatasetService(repo, &stubStore{}, stubCategoryReader{exists: true}, &stubEnqueuer{}, noopParser, rp, conv)
if err := svc.ExtractProperties(context.Background(), id); err != nil {
t.Fatalf("unexpected error: %v", err)
}
got := repo.store[id]
if got.Status != domain.DatasetStatusReady {
t.Fatalf("want ready, got %q", got.Status)
}
if got.Geometry != nil {
t.Fatalf("expected nil geometry on extraction failure, got %s", got.Geometry)
}
if got.Properties == nil {
t.Fatalf("expected properties to be set despite geometry failure")
}
} }
func TestDatasetService_ExtractProperties_NoTable(t *testing.T) { func TestDatasetService_ExtractProperties_NoTable(t *testing.T) {