gis/internal/parser/shapefile.go

170 lines
4.0 KiB
Go

package parser
import (
"archive/zip"
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"strings"
"unicode/utf8"
"gis/internal/domain"
"golang.org/x/text/encoding/charmap"
)
// readDBF pulls the attribute table out of a zipped ESRI shapefile.
//
// data holds the complete zip archive. Entries are scanned in archive order
// and the first one whose name ends in ".dbf" (compared case-insensitively)
// is returned fully decompressed. An error is returned when data is not a
// readable zip archive, when no entry matches, or when the matching entry
// cannot be opened or read.
func readDBF(data []byte) ([]byte, error) {
	archive, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return nil, fmt.Errorf("open zip: %w", err)
	}

	for _, entry := range archive.File {
		if !strings.HasSuffix(strings.ToLower(entry.Name), ".dbf") {
			continue
		}

		// Only the first match is used: every path below returns, so the
		// deferred Close runs before any other entry could be touched.
		src, err := entry.Open()
		if err != nil {
			return nil, fmt.Errorf("open .dbf: %w", err)
		}
		defer src.Close()

		contents, err := io.ReadAll(src)
		if err != nil {
			return nil, fmt.Errorf("read .dbf: %w", err)
		}
		return contents, nil
	}

	return nil, errors.New("no .dbf file found in archive")
}
// shapefileColumns lists the attribute columns of a zipped shapefile together
// with sample values for each of them.
//
// data is the zip archive; its .dbf entry is located with readDBF and its
// layout parsed with dbfHeader, and an error from either is returned as is.
// One column is produced per header field, in header order. Samples come from
// the leading non-deleted records: the first such record is always sampled
// and iteration stops as soon as the number of sampled records reaches
// sampleRows. A table without usable records yields columns whose Samples
// are left unset.
func shapefileColumns(data []byte) ([]domain.AttributeColumn, error) {
	dbf, err := readDBF(data)
	if err != nil {
		return nil, err
	}

	fields, headerSize, recordLen, err := dbfHeader(dbf)
	if err != nil {
		return nil, err
	}

	// Name the columns up front, then let the record walk fill in the samples.
	cols := make([]domain.AttributeColumn, len(fields))
	for i := range fields {
		cols[i].Name = fields[i].name
	}

	sampled := 0
	dbfEachRecord(dbf, fields, headerSize, recordLen, func(values []string) bool {
		for i := range cols {
			cols[i].Samples = append(cols[i].Samples, values[i])
		}
		sampled++
		return sampled < sampleRows
	})

	return cols, nil
}
// shapefileRows loads the whole attribute table of a zipped shapefile.
//
// data is the zip archive; its .dbf entry is located with readDBF and its
// layout parsed with dbfHeader, and an error from either is returned as is.
// Every non-deleted record becomes one map from field name to trimmed value,
// in file order. When two fields share a name the later one wins. The result
// is nil when the table holds no usable records.
func shapefileRows(data []byte) ([]map[string]string, error) {
	dbf, err := readDBF(data)
	if err != nil {
		return nil, err
	}

	fields, headerSize, recordLen, err := dbfHeader(dbf)
	if err != nil {
		return nil, err
	}

	var out []map[string]string
	collect := func(values []string) bool {
		record := make(map[string]string, len(fields))
		for i := range fields {
			record[fields[i].name] = values[i]
		}
		out = append(out, record)
		return true // never stop early: every record is wanted
	}
	dbfEachRecord(dbf, fields, headerSize, recordLen, collect)

	return out, nil
}
// dbfField describes where one column lives inside a fixed-width .dbf record.
type dbfField struct {
	name           string // column name read from the field descriptor
	offset, length int    // start position within a record and width, both in bytes
}
// dbfHeader parses a dBASE III/IV header into fields plus record geometry.
//
// b is the complete .dbf file. The header size and the record length are read
// as little-endian 16-bit values from bytes 8-9 and 10-11. Field descriptors
// are 32 bytes each and start at byte 32: the NUL-padded name sits in the
// first 11 bytes and the field width in byte 16. Each field's offset is its
// position inside a record, counted from 1 because byte 0 of every record is
// the deletion flag.
//
// The descriptor list ends at the 0x0D terminator or at the declared header
// size, whichever comes first. Relying on the terminator alone would parse
// the first records as bogus fields whenever a writer leaves the terminator
// out.
//
// It returns an error when b is shorter than the 32-byte prologue and
// ErrNoColumns when no field descriptor is found.
func dbfHeader(b []byte) (fields []dbfField, headerSize, recordLen int, err error) {
	const (
		prologueLen   = 32   // fixed part of the header preceding the descriptors
		descriptorLen = 32   // size of one field descriptor
		terminator    = 0x0D // marks the end of the descriptor list
	)

	if len(b) < prologueLen {
		return nil, 0, 0, errors.New("dbf too short")
	}
	headerSize = int(binary.LittleEndian.Uint16(b[8:10]))
	recordLen = int(binary.LittleEndian.Uint16(b[10:12]))

	// Descriptors never extend past the declared header. A declared size
	// smaller than the prologue is meaningless, so in that case (and when the
	// header claims to be longer than the file) only the file length bounds
	// the scan.
	descEnd := len(b)
	if headerSize >= prologueLen && headerSize < descEnd {
		descEnd = headerSize
	}

	recOffset := 1 // first byte of each record is the deletion flag
	for off := prologueLen; off+descriptorLen <= descEnd && b[off] != terminator; off += descriptorLen {
		name := decodeText(trimNull(b[off : off+11]))
		length := int(b[off+16])
		fields = append(fields, dbfField{name: name, offset: recOffset, length: length})
		recOffset += length
	}
	if len(fields) == 0 {
		return nil, 0, 0, ErrNoColumns
	}
	return fields, headerSize, recordLen, nil
}
// dbfEachRecord decodes each non-deleted record's field values (in field order)
// and calls fn; iteration stops when fn returns false.
//
// b is the complete .dbf file, headerSize the offset of the first record and
// recordLen the width of one record including its leading flag byte. Nothing
// is visited when headerSize or recordLen is not positive. Only whole records
// are visited; a partial record at the end of b is ignored.
//
// The flag byte decides what happens to a record slot: '*' marks a deleted
// record, which is skipped, and 0x1A is the end-of-file marker, which ends the
// walk so that padding written after the marker is never reported as data.
//
// Each value is decoded with decodeText and stripped of surrounding
// whitespace. A field that does not fit inside the record yields "". fn gets
// a fresh slice of len(fields) values on every call and may keep it.
func dbfEachRecord(b []byte, fields []dbfField, headerSize, recordLen int, fn func(values []string) bool) {
	const (
		deletedFlag = '*'  // flag byte of a deleted record
		eofMarker   = 0x1A // written after the last record
	)

	if headerSize <= 0 || recordLen <= 0 {
		return
	}
	for start := headerSize; start+recordLen <= len(b); start += recordLen {
		rec := b[start : start+recordLen]
		switch rec[0] {
		case eofMarker:
			// Anything from here on is padding, not records.
			return
		case deletedFlag:
			continue
		}

		values := make([]string, len(fields))
		for i, f := range fields {
			// Guard against descriptors that claim more bytes than a record has.
			if f.offset+f.length <= len(rec) {
				values[i] = strings.TrimSpace(decodeText(rec[f.offset : f.offset+f.length]))
			}
		}
		if !fn(values) {
			return
		}
	}
}
// trimNull cuts b at its first NUL byte and returns the part before it.
// b is returned unchanged when it contains no NUL byte. The result shares
// b's backing array.
func trimNull(b []byte) []byte {
	end := bytes.IndexByte(b, 0)
	if end < 0 {
		return b
	}
	return b[:end]
}
// decodeText converts raw .dbf bytes to a Go string.
//
// Bytes that already form valid UTF-8 are used unchanged. Anything else is
// treated as Windows-1251 (common for Cyrillic KATO data) and transcoded to
// UTF-8. Should the transcoding fail, the raw bytes are returned as they are.
func decodeText(b []byte) string {
	if !utf8.Valid(b) {
		converted, err := charmap.Windows1251.NewDecoder().Bytes(b)
		if err == nil {
			return string(converted)
		}
	}
	return string(b)
}