From a6e0c1a3b06e045ac6d1341c44ff3e253320567e Mon Sep 17 00:00:00 2001 From: Shivang Nagta Date: Tue, 16 Jun 2026 14:57:33 +0530 Subject: [PATCH 1/2] FEAT: Add legacy .xls support to converter task --- go.mod | 1 + go.sum | 2 + .../pkg/pipeline/task/converter/README.md | 27 ++- .../pkg/pipeline/task/converter/converter.go | 1 + internal/pkg/pipeline/task/converter/xls.go | 165 ++++++++++++++++++ 5 files changed, 190 insertions(+), 6 deletions(-) create mode 100644 internal/pkg/pipeline/task/converter/xls.go diff --git a/go.mod b/go.mod index d119207..58d2a99 100644 --- a/go.mod +++ b/go.mod @@ -20,6 +20,7 @@ require ( github.com/hamba/avro/v2 v2.24.0 github.com/itchyny/gojq v0.12.17 github.com/jhillyerd/enmime v1.3.0 + github.com/patterninc/grate v0.1.0 github.com/pkg/sftp v1.13.10 github.com/stretchr/testify v1.11.1 github.com/xuri/excelize/v2 v2.10.0 diff --git a/go.sum b/go.sum index ee1cf3f..073450f 100644 --- a/go.sum +++ b/go.sum @@ -371,6 +371,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= +github.com/patterninc/grate v0.1.0 h1:Ut9IaRCgRbb/8llyNBFXYhlp0jLT2qBuqyzVS82xg7o= +github.com/patterninc/grate v0.1.0/go.mod h1:YTlJg1+jv60uyHTHP4+vH+F9IjYZSmsW9Yn2NczIuF0= github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8= github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= diff --git a/internal/pkg/pipeline/task/converter/README.md b/internal/pkg/pipeline/task/converter/README.md index 3015559..f25caab 100644 --- a/internal/pkg/pipeline/task/converter/README.md +++ 
b/internal/pkg/pipeline/task/converter/README.md @@ -1,6 +1,6 @@ # Converter Task -The `converter` task converts data between different formats, supporting CSV, HTML, XLSX (Excel), EML (Email), Protobuf, and other data format transformations. +The `converter` task converts data between different formats, supporting CSV, HTML, XLSX, XLS, EML (Email), Protobuf, and other data format transformations. ## Function @@ -21,7 +21,7 @@ The converter task transforms data between different formats. It receives record |-------|------|---------|-------------| | `name` | string | - | Task name for identification | | `type` | string | `converter` | Must be "converter" | -| `format` | string | - | Format to convert to (csv, html, sst, xlsx, eml, protobuf) | +| `format` | string | - | Format to convert to (csv, html, sst, xlsx, xls, eml, protobuf) | | `delimiter` | string| \t | Used only in sst converter for spliting key and value| ### CSV Format Options @@ -92,7 +92,7 @@ tasks: ### SST Format Options Convert a single line to the SSTable which could be stored on s3 or via file. It expects a single line as input -### XLSX Format Options +### XLSX / XLS Format Options | Field | Type | Default | Description | |-------|------|---------|-------------| @@ -102,14 +102,15 @@ Convert a single line to the SSTable which could be stored on s3 or via file. It | `sanitize_headers` | bool | `false` | If true, normalizes header row values: non-alphanumeric characters are replaced by underscores, leading/trailing underscores are trimmed, and the result is lowercased. Assumes the first unskipped row to be header | | `sanitize_sheet_names` | bool | `false` | If true, normalizes sheet names: non-alphanumeric characters are replaced by underscores, leading/trailing underscores are trimmed, and the result is lowercased before storing in the `xlsx_sheet_name` context key | -**Important:** The XLSX converter emits **one record per sheet**. 
Each record contains the sheet's data in CSV format, with the sheet name available in the record context under the key `xlsx_sheet_name`. +**Important:** Both converters emit **one record per sheet**. Each record contains the sheet's data in CSV format, with the sheet name available in the record context under the key `xlsx_sheet_name`. ## Supported Formats The converter supports the following formats: - **CSV**: Converts CSV data to JSON with column mapping and type conversion - **HTML**: Converts HTML to JSON representation with element structure -- **XLSX**: Converts Excel files to CSV format. **Note:** Each sheet in the Excel file is emitted as a separate record with the sheet name stored in the context (key: `xlsx_sheet_name`) +- **XLSX**: Converts modern Excel files to CSV format. **Note:** Each sheet is emitted as a separate record with the sheet name stored in the context (key: `xlsx_sheet_name`) +- **XLS**: Converts legacy Excel 97-2003 files (`.xls`, BIFF8) to CSV format. Same options and per-sheet output as XLSX - **EML**: Converts EML (Email) files to their constituent parts (HTML body, Text body, Attachments) - **Protobuf**: Decodes binary protobuf messages to JSON using a compiled FileDescriptorSet @@ -166,6 +167,20 @@ tasks: only_data: true ``` +### Legacy Excel 97-2003 (.xls) to CSV: +```yaml +tasks: + - name: read_excel + type: file + path: data.xls + - name: convert_excel + type: converter + format: xls + - name: echo + type: echo + only_data: true +``` + ### Excel to CSV conversion (specific sheets): ```yaml tasks: @@ -236,4 +251,4 @@ tasks: - **Database migration**: Convert data for different database systems - **Report generation**: Convert data to report-friendly formats - **Data exchange**: Enable data sharing between different systems -- **ETL workflows**: Transform data as part of extract, transform, load processes \ No newline at end of file +- **ETL workflows**: Transform data as part of extract, transform, load processes diff --git 
a/internal/pkg/pipeline/task/converter/converter.go b/internal/pkg/pipeline/task/converter/converter.go index 132db64..40827b1 100644 --- a/internal/pkg/pipeline/task/converter/converter.go +++ b/internal/pkg/pipeline/task/converter/converter.go @@ -40,6 +40,7 @@ func (c *core) UnmarshalYAML(unmarshal func(interface{}) error) error { `html`: new(html), `sst`: new(sst), `xlsx`: new(xlsx), + `xls`: new(xls), `eml`: new(eml), `protobuf`: new(protobuf), } diff --git a/internal/pkg/pipeline/task/converter/xls.go b/internal/pkg/pipeline/task/converter/xls.go new file mode 100644 index 0000000..f152af8 --- /dev/null +++ b/internal/pkg/pipeline/task/converter/xls.go @@ -0,0 +1,165 @@ +package converter + +import ( + "bytes" + csvEncoder "encoding/csv" + "fmt" + "os" + + "github.com/patterninc/grate" + gratexls "github.com/patterninc/grate/xls" + + "github.com/patterninc/caterpillar/internal/pkg/textutil" +) + +func init() { + gratexls.HandleHyperlink = gratexls.PreserveDisplayText +} + +type xls struct { + Sheets []string `yaml:"sheets,omitempty" json:"sheets,omitempty"` + SkipRows int `yaml:"skip_rows,omitempty" json:"skip_rows,omitempty"` + SkipRowsBySheet map[string]int `yaml:"skip_rows_by_sheet,omitempty" json:"skip_rows_by_sheet,omitempty"` + SanitizeHeaders bool `yaml:"sanitize_headers,omitempty" json:"sanitize_headers,omitempty"` + SanitizeSheetNames bool `yaml:"sanitize_sheet_names,omitempty" json:"sanitize_sheet_names,omitempty"` +} + +func (x *xls) convert(data []byte, _ string) (outputs []converterOutput, err error) { + // recover so a panic does not crash the task. + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("panic while parsing .xls file: %v", r) + } + }() + + // grate.Open reads from a file path, so spill the in-memory bytes to a temp file. 
+ tmp, err := os.CreateTemp("", "caterpillar-*.xls") + if err != nil { + return nil, fmt.Errorf("creating temp file for .xls: %w", err) + } + defer os.Remove(tmp.Name()) + if _, err = tmp.Write(data); err != nil { + tmp.Close() + return nil, fmt.Errorf("writing temp .xls: %w", err) + } + if err = tmp.Close(); err != nil { + return nil, fmt.Errorf("closing temp .xls: %w", err) + } + + reader, err := grate.Open(tmp.Name()) + if err != nil { + return nil, err + } + defer reader.Close() + + // Get sheets (visible + hidden) + sheets, err := reader.List() + if err != nil { + return nil, err + } + if h, ok := reader.(interface{ ListHidden() ([]string, error) }); ok { + if hidden, herr := h.ListHidden(); herr == nil { + sheets = append(sheets, hidden...) + } + } + if len(sheets) == 0 { + return nil, fmt.Errorf("no sheet found in the excel file") + } + + if len(x.Sheets) > 0 { + sheets = x.Sheets + } + + // Create one output record per sheet + outputs = make([]converterOutput, 0, len(sheets)) + for _, sheet := range sheets { + output, err := x.readSheet(reader, sheet) + if err != nil { + return nil, err + } + + outputs = append(outputs, output) + } + + return outputs, nil +} + +func (x *xls) readSheet(reader grate.Source, sheet string) (converterOutput, error) { + rowsToSkip := x.getRowsToSkip(sheet) + // Create buffer for this sheet + var buff bytes.Buffer + writer := csvEncoder.NewWriter(&buff) + + // Get all rows from the sheet + rows, err := reader.Get(sheet) + if err != nil { + return converterOutput{}, fmt.Errorf("error reading rows from sheet %s: %w", sheet, err) + } + + // grate pads every row to the sheet's max width and emits one trailing empty + // row (the sheet dimension is "last row + 1"). 
+ var allRows [][]string + for rows.Next() { + allRows = append(allRows, trimTrailingEmpty(rows.Strings())) + } + for len(allRows) > 0 && len(allRows[len(allRows)-1]) == 0 { + allRows = allRows[:len(allRows)-1] + } + + // Write rows to buffer + isHeaderRow := true + for i, cols := range allRows { + if i < rowsToSkip { + continue + } + + if x.SanitizeHeaders && isHeaderRow { + for j, col := range cols { + cols[j] = textutil.Slugify(col) + } + isHeaderRow = false + } + + if err := writer.Write(cols); err != nil { + return converterOutput{}, err + } + } + + // Flush the writer + writer.Flush() + + outputSheetName := sheet + if x.SanitizeSheetNames { + outputSheetName = textutil.Slugify(sheet) + } + + return converterOutput{ + Data: buff.Bytes(), + Metadata: map[string]string{ + sheetName: outputSheetName, + }, + }, nil +} + +func (x *xls) getRowsToSkip(sheet string) int { + rowsToSkip := x.SkipRows + if x.SkipRowsBySheet != nil { + if val, found := x.SkipRowsBySheet[sheet]; found { + rowsToSkip = val + } + } + + if rowsToSkip < 0 { + rowsToSkip = 0 + } + + return rowsToSkip +} + +func trimTrailingEmpty(row []string) []string { + end := len(row) + for end > 0 && row[end-1] == "" { + end-- + } + return row[:end] +} From 303ad6fd8fa674f179eac5135e1b4ec7ef3dbbe6 Mon Sep 17 00:00:00 2001 From: Shivang Nagta Date: Thu, 18 Jun 2026 17:05:22 +0530 Subject: [PATCH 2/2] Change xls library from grate to xlrd-go --- go.mod | 4 +- go.sum | 12 +- internal/pkg/pipeline/task/converter/xls.go | 189 +++++++++++++++----- 3 files changed, 150 insertions(+), 55 deletions(-) diff --git a/go.mod b/go.mod index 58d2a99..cb13bbc 100644 --- a/go.mod +++ b/go.mod @@ -20,10 +20,10 @@ require ( github.com/hamba/avro/v2 v2.24.0 github.com/itchyny/gojq v0.12.17 github.com/jhillyerd/enmime v1.3.0 - github.com/patterninc/grate v0.1.0 github.com/pkg/sftp v1.13.10 github.com/stretchr/testify v1.11.1 github.com/xuri/excelize/v2 v2.10.0 + github.com/yamitzky/xlrd-go v0.1.0 golang.org/x/crypto v0.45.0 
golang.org/x/net v0.47.0 google.golang.org/protobuf v1.36.9 @@ -85,7 +85,7 @@ require ( go.yaml.in/yaml/v2 v2.4.3 // indirect golang.org/x/oauth2 v0.30.0 // indirect golang.org/x/sys v0.38.0 // indirect - golang.org/x/text v0.31.0 // indirect + golang.org/x/text v0.32.0 // indirect ) require ( diff --git a/go.sum b/go.sum index 073450f..e6b6772 100644 --- a/go.sum +++ b/go.sum @@ -371,8 +371,6 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= -github.com/patterninc/grate v0.1.0 h1:Ut9IaRCgRbb/8llyNBFXYhlp0jLT2qBuqyzVS82xg7o= -github.com/patterninc/grate v0.1.0/go.mod h1:YTlJg1+jv60uyHTHP4+vH+F9IjYZSmsW9Yn2NczIuF0= github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8= github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= @@ -479,6 +477,8 @@ github.com/xuri/excelize/v2 v2.10.0 h1:8aKsP7JD39iKLc6dH5Tw3dgV3sPRh8uRVXu/fMstf github.com/xuri/excelize/v2 v2.10.0/go.mod h1:SC5TzhQkaOsTWpANfm+7bJCldzcnU/jrhqkTi/iBHBU= github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 h1:+C0TIdyyYmzadGaL/HBLbf3WdLgC29pgyhTjAT/0nuE= github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ= +github.com/yamitzky/xlrd-go v0.1.0 h1:WPrLvRMz/ob+ZmEWMmbg/TtrUVh2BTCGGzbqRzsrYBU= +github.com/yamitzky/xlrd-go v0.1.0/go.mod h1:qH3XYtKvWAvhH87qmIDY6YgxAXKAyLD28jpum/PLS7k= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark 
v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= @@ -568,8 +568,8 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= -golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -606,8 +606,8 @@ golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= -golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= +golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U= golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git 
a/internal/pkg/pipeline/task/converter/xls.go b/internal/pkg/pipeline/task/converter/xls.go index f152af8..811bb06 100644 --- a/internal/pkg/pipeline/task/converter/xls.go +++ b/internal/pkg/pipeline/task/converter/xls.go @@ -4,18 +4,15 @@ import ( "bytes" csvEncoder "encoding/csv" "fmt" - "os" + "io" + "math" + "strconv" - "github.com/patterninc/grate" - gratexls "github.com/patterninc/grate/xls" + "github.com/yamitzky/xlrd-go/xlrd" "github.com/patterninc/caterpillar/internal/pkg/textutil" ) -func init() { - gratexls.HandleHyperlink = gratexls.PreserveDisplayText -} - type xls struct { Sheets []string `yaml:"sheets,omitempty" json:"sheets,omitempty"` SkipRows int `yaml:"skip_rows,omitempty" json:"skip_rows,omitempty"` @@ -25,43 +22,24 @@ type xls struct { } func (x *xls) convert(data []byte, _ string) (outputs []converterOutput, err error) { - // recover so a panic does not crash the task. + // recover to avoid crash due to panic. defer func() { if r := recover(); r != nil { err = fmt.Errorf("panic while parsing .xls file: %v", r) } }() - // grate.Open reads from a file path, so spill the in-memory bytes to a temp file. - tmp, err := os.CreateTemp("", "caterpillar-*.xls") - if err != nil { - return nil, fmt.Errorf("creating temp file for .xls: %w", err) - } - defer os.Remove(tmp.Name()) - if _, err = tmp.Write(data); err != nil { - tmp.Close() - return nil, fmt.Errorf("writing temp .xls: %w", err) - } - if err = tmp.Close(); err != nil { - return nil, fmt.Errorf("closing temp .xls: %w", err) - } - - reader, err := grate.Open(tmp.Name()) + // Logfile defaults to stdout, so redirect its diagnostics to avoid polluting task output. 
+ reader, err := xlrd.OpenWorkbook(``, &xlrd.OpenWorkbookOptions{ + FileContents: data, + Logfile: io.Discard, + }) if err != nil { return nil, err } - defer reader.Close() - // Get sheets (visible + hidden) - sheets, err := reader.List() - if err != nil { - return nil, err - } - if h, ok := reader.(interface{ ListHidden() ([]string, error) }); ok { - if hidden, herr := h.ListHidden(); herr == nil { - sheets = append(sheets, hidden...) - } - } + // Get sheets + sheets := reader.SheetNames() if len(sheets) == 0 { return nil, fmt.Errorf("no sheet found in the excel file") } @@ -72,6 +50,7 @@ func (x *xls) convert(data []byte, _ string) (outputs []converterOutput, err err // Create one output record per sheet outputs = make([]converterOutput, 0, len(sheets)) + for _, sheet := range sheets { output, err := x.readSheet(reader, sheet) if err != nil { @@ -84,31 +63,23 @@ func (x *xls) convert(data []byte, _ string) (outputs []converterOutput, err err return outputs, nil } -func (x *xls) readSheet(reader grate.Source, sheet string) (converterOutput, error) { +func (x *xls) readSheet(reader *xlrd.Book, sheet string) (converterOutput, error) { rowsToSkip := x.getRowsToSkip(sheet) // Create buffer for this sheet var buff bytes.Buffer writer := csvEncoder.NewWriter(&buff) // Get all rows from the sheet - rows, err := reader.Get(sheet) + // Unlike xlsx, which uses excelize, there is no API to get formatted rows as output, + // so we require custom formatting over the cells based on their format types + rows, err := x.sheetRows(reader, sheet) if err != nil { return converterOutput{}, fmt.Errorf("error reading rows from sheet %s: %w", sheet, err) } - // grate pads every row to the sheet's max width and emits one trailing empty - // row (the sheet dimension is "last row + 1").
- var allRows [][]string - for rows.Next() { - allRows = append(allRows, trimTrailingEmpty(rows.Strings())) - } - for len(allRows) > 0 && len(allRows[len(allRows)-1]) == 0 { - allRows = allRows[:len(allRows)-1] - } - // Write rows to buffer isHeaderRow := true - for i, cols := range allRows { + for i, cols := range rows { if i < rowsToSkip { continue } @@ -127,6 +98,9 @@ func (x *xls) readSheet(reader grate.Source, sheet string) (converterOutput, err // Flush the writer writer.Flush() + if err := writer.Error(); err != nil { + return converterOutput{}, err + } outputSheetName := sheet if x.SanitizeSheetNames { @@ -141,6 +115,29 @@ func (x *xls) readSheet(reader grate.Source, sheet string) (converterOutput, err }, nil } +// -------------------------- Everything below is a helper function ----------------------------// + +func (x *xls) sheetRows(reader *xlrd.Book, sheet string) ([][]string, error) { + sh, err := reader.SheetByName(sheet) + if err != nil { + return nil, err + } + + rows := make([][]string, 0, sh.NRows) + for r := 0; r < sh.NRows; r++ { + cols := make([]string, 0, sh.NCols) + for c := 0; c < sh.NCols; c++ { + cols = append(cols, cellString(reader, sh, r, c)) + } + rows = append(rows, trimTrailingEmpty(cols)) + } + for len(rows) > 0 && len(rows[len(rows)-1]) == 0 { + rows = rows[:len(rows)-1] + } + + return rows, nil +} + func (x *xls) getRowsToSkip(sheet string) int { rowsToSkip := x.SkipRows if x.SkipRowsBySheet != nil { @@ -156,6 +153,104 @@ func (x *xls) getRowsToSkip(sheet string) int { return rowsToSkip } +// cellString renders a cell to its CSV text +func cellString(book *xlrd.Book, sheet *xlrd.Sheet, r, c int) string { + switch sheet.RawCellType(r, c) { + case xlrd.XL_CELL_TEXT: + if s, ok := sheet.RawCellValue(r, c).(string); ok { + return s + } + case xlrd.XL_CELL_NUMBER: + f, ok := sheet.RawCellValue(r, c).(float64) + if !ok { + return `` + } + if isDateCell(book, sheet.RawCellXFIndex(r, c)) { + if s, ok := formatDate(f, book.Datemode); ok {
+ return s + } + } + return formatNumber(f) + case xlrd.XL_CELL_BOOLEAN: + switch v := sheet.RawCellValue(r, c).(type) { + case int: + return boolText(v != 0) + case bool: + return boolText(v) + } + case xlrd.XL_CELL_ERROR: + // xlrd-go stores the raw BIFF error code, so we emit "#ERR" followed by that code. + return fmt.Sprintf(`#ERR%v`, sheet.RawCellValue(r, c)) + } + return `` +} + +// isDateCell reports whether a numeric cell carries a date/time number format +func isDateCell(book *xlrd.Book, xfIndex int) bool { + if xfIndex < 0 || xfIndex >= len(book.XFList) { + return false + } + formatKey := book.XFList[xfIndex].FormatKey + if isBuiltinDateFormat(formatKey) { + return true + } + if book.FormatMap == nil { + return false + } + format := book.FormatMap[formatKey] + if format == nil || format.FormatString == `` { + return false + } + return xlrd.IsDateFormatString(book, format.FormatString) +} + +// isBuiltinDateFormat reports whether a built-in number-format key is a date/time +// format. Ranges match excelize +func isBuiltinDateFormat(key int) bool { + switch { + case key >= 14 && key <= 22, + key >= 27 && key <= 36, + key >= 45 && key <= 47, + key >= 50 && key <= 58, + key >= 71 && key <= 81: + return true + default: + return false + } +} + +func formatDate(value float64, datemode int) (string, bool) { + if math.IsNaN(value) || math.IsInf(value, 0) { + return ``, false + } + t, err := xlrd.XldateAsDatetime(value, datemode) + if err != nil { + return ``, false + } + switch { + case value >= 0 && value < 1: + return t.Format(`15:04:05`), true + case value != math.Floor(value): + return t.Format(`2006-01-02 15:04:05`), true + default: + return t.Format(`2006-01-02`), true + } +} + +func boolText(b bool) string { + if b { + return `TRUE` + } + return `FALSE` +} + +func formatNumber(f float64) string { + if f == math.Trunc(f) && !math.IsInf(f, 0) && !math.IsNaN(f) && math.Abs(f) < 1e18 { + return strconv.FormatInt(int64(f), 10) + } + return strconv.FormatFloat(f, 'f', -1, 64) +} +
func trimTrailingEmpty(row []string) []string { end := len(row) for end > 0 && row[end-1] == "" {