diff --git a/.circleci/config.yml b/.circleci/config.yml index b98f4df..93088c9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -30,6 +30,7 @@ jobs: github.com/ghodss/yaml github.com/ipfs/go-log gopkg.in/yaml.v2 + github.com/360EntSecGroup-Skylar/excelize - run: name: Run Lint Tests command: golint ./... diff --git a/data_format.go b/data_format.go index 4c1fbc2..a162d20 100644 --- a/data_format.go +++ b/data_format.go @@ -30,9 +30,8 @@ const ( // XMLDataFormat specifies eXtensible Markup Language-formatted data // currently not supported. XMLDataFormat - // XLSDataFormat specifies microsoft excel formatted data - // currently not supported. - XLSDataFormat + // XLSXDataFormat specifies microsoft excel formatted data + XLSXDataFormat ) // SupportedDataFormats gives a slice of data formats that are @@ -44,6 +43,7 @@ func SupportedDataFormats() []DataFormat { CBORDataFormat, JSONDataFormat, CSVDataFormat, + XLSXDataFormat, } } @@ -54,7 +54,7 @@ func (f DataFormat) String() string { CSVDataFormat: "csv", JSONDataFormat: "json", XMLDataFormat: "xml", - XLSDataFormat: "xls", + XLSXDataFormat: "xlsx", CBORDataFormat: "cbor", }[f] @@ -76,8 +76,8 @@ func ParseDataFormatString(s string) (df DataFormat, err error) { "json": JSONDataFormat, ".xml": XMLDataFormat, "xml": XMLDataFormat, - ".xls": XLSDataFormat, - "xls": XLSDataFormat, + ".xlsx": XLSXDataFormat, + "xlsx": XLSXDataFormat, "cbor": CBORDataFormat, ".cbor": CBORDataFormat, }[s] diff --git a/data_format_config.go b/data_format_config.go index 03d1245..cbae96a 100644 --- a/data_format_config.go +++ b/data_format_config.go @@ -20,6 +20,8 @@ func ParseFormatConfigMap(f DataFormat, opts map[string]interface{}) (FormatConf return NewCSVOptions(opts) case JSONDataFormat: return NewJSONOptions(opts) + case XLSXDataFormat: + return NewXLSXOptions(opts) default: return nil, fmt.Errorf("cannot parse configuration for format: %s", f.String()) } @@ -141,3 +143,44 @@ func (o *JSONOptions) Map() 
map[string]interface{} { } return map[string]interface{}{} } + +// XLSXOptions specifies configuration details for the xlsx file format +type XLSXOptions struct { + SheetName string `json:"sheetName,omitempty"` +} + +// NewXLSXOptions creates an XLSXOptions pointer from a map +func NewXLSXOptions(opts map[string]interface{}) (FormatConfig, error) { + o := &XLSXOptions{} + if opts == nil { + return o, nil + } + + if opts["sheetName"] != nil { + if sheetName, ok := opts["sheetName"].(string); ok { + o.SheetName = sheetName + } else { + return nil, fmt.Errorf("invalid sheetName value: %v", opts["sheetName"]) + } + } + + return o, nil +} + +// Format announces the XLSX data format for the FormatConfig interface +func (*XLSXOptions) Format() DataFormat { + return XLSXDataFormat +} + +// Map structures XLSXOptions as a map of string keys to values +func (o *XLSXOptions) Map() map[string]interface{} { + if o == nil { + return nil + } + opt := map[string]interface{}{} + if o.SheetName != "" { + opt["sheetName"] = o.SheetName + } + + return opt +} diff --git a/data_format_config_test.go b/data_format_config_test.go index b512a08..361a1c0 100644 --- a/data_format_config_test.go +++ b/data_format_config_test.go @@ -30,7 +30,7 @@ func TestParseFormatConfigMap(t *testing.T) { }{ {CSVDataFormat, map[string]interface{}{}, &CSVOptions{}, ""}, {JSONDataFormat, map[string]interface{}{}, &JSONOptions{}, ""}, - {XLSDataFormat, map[string]interface{}{}, nil, "cannot parse configuration for format: xls"}, + {XLSXDataFormat, map[string]interface{}{}, &XLSXOptions{}, ""}, } for i, c := range cases { @@ -136,3 +136,56 @@ func TestJSONOptionsMap(t *testing.T) { } } } + +func TestNewXLSXOptions(t *testing.T) { + cases := []struct { + opts map[string]interface{} + res *XLSXOptions + err string + }{ + {nil, &XLSXOptions{}, ""}, + {map[string]interface{}{}, &XLSXOptions{}, ""}, + {map[string]interface{}{"sheetName": "foo"}, &XLSXOptions{SheetName: "foo"}, ""}, + 
{map[string]interface{}{"sheetName": true}, nil, "invalid sheetName value: true"}, + } + + for i, c := range cases { + got, err := NewXLSXOptions(c.opts) + if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { + t.Errorf("case %d error expected: '%s', got: '%s'", i, c.err, err) + continue + } + if c.err == "" { + xlsxo, ok := got.(*XLSXOptions) + if !ok { + t.Errorf("case %d didn't return a XLSXOptions pointer", i) + continue + } + + if xlsxo.SheetName != c.res.SheetName { + t.Errorf("case %d SheetName expected: %s, got: %s", i, c.res.SheetName, xlsxo.SheetName) + continue + } + } + } +} + +func TestXLSXOptionsMap(t *testing.T) { + cases := []struct { + opt *XLSXOptions + res map[string]interface{} + }{ + {nil, nil}, + {&XLSXOptions{}, map[string]interface{}{}}, + {&XLSXOptions{SheetName: "foo"}, map[string]interface{}{"sheetName": "foo"}}, + } + + for i, c := range cases { + got := c.opt.Map() + for key, val := range c.res { + if got[key] != val { + t.Errorf("case %d, key '%s' expected: '%s' got:'%s'", i, key, val, got[key]) + } + } + } +} diff --git a/data_format_test.go b/data_format_test.go index 7670663..63fc1a8 100644 --- a/data_format_test.go +++ b/data_format_test.go @@ -10,6 +10,7 @@ func TestSupportedDataFormats(t *testing.T) { CBORDataFormat, JSONDataFormat, CSVDataFormat, + XLSXDataFormat, } for i, f := range SupportedDataFormats() { @@ -28,7 +29,7 @@ func TestDataFormatString(t *testing.T) { {CSVDataFormat, "csv"}, {JSONDataFormat, "json"}, {XMLDataFormat, "xml"}, - {XLSDataFormat, "xls"}, + {XLSXDataFormat, "xlsx"}, {CBORDataFormat, "cbor"}, } @@ -53,8 +54,8 @@ func TestParseDataFormatString(t *testing.T) { {"json", JSONDataFormat, ""}, {".xml", XMLDataFormat, ""}, {"xml", XMLDataFormat, ""}, - {".xls", XLSDataFormat, ""}, - {"xls", XLSDataFormat, ""}, + {".xlsx", XLSXDataFormat, ""}, + {"xlsx", XLSXDataFormat, ""}, {"cbor", CBORDataFormat, ""}, {".cbor", CBORDataFormat, ""}, } @@ -81,7 +82,7 @@ func TestDataFormatMarshalJSON(t 
*testing.T) { {CSVDataFormat, []byte(`"csv"`), ""}, {JSONDataFormat, []byte(`"json"`), ""}, {XMLDataFormat, []byte(`"xml"`), ""}, - {XLSDataFormat, []byte(`"xls"`), ""}, + {XLSXDataFormat, []byte(`"xlsx"`), ""}, {CBORDataFormat, []byte(`"cbor"`), ""}, } for i, c := range cases { @@ -106,7 +107,7 @@ func TestDataFormatUnmarshalJSON(t *testing.T) { {[]byte(`"csv"`), CSVDataFormat, ""}, {[]byte(`"json"`), JSONDataFormat, ""}, {[]byte(`"xml"`), XMLDataFormat, ""}, - {[]byte(`"xls"`), XLSDataFormat, ""}, + {[]byte(`"xlsx"`), XLSXDataFormat, ""}, {[]byte(`"cbor"`), CBORDataFormat, ""}, } diff --git a/detect/detect.go b/detect/detect.go index dd30b87..a9c874c 100644 --- a/detect/detect.go +++ b/detect/detect.go @@ -59,8 +59,8 @@ func ExtensionDataFormat(path string) (format dataset.DataFormat, err error) { return dataset.CSVDataFormat, nil case ".xml": return dataset.XMLDataFormat, nil - case ".xls": - return dataset.XLSDataFormat, nil + case ".xlsx": + return dataset.XLSXDataFormat, nil case "": return dataset.UnknownDataFormat, errors.New("no file extension provided") default: diff --git a/detect/detect_test.go b/detect/detect_test.go index 9417a96..1852968 100644 --- a/detect/detect_test.go +++ b/detect/detect_test.go @@ -85,7 +85,7 @@ func TestExtensionDataFormat(t *testing.T) { {"foo/bar/baz.csv", dataset.CSVDataFormat, ""}, {"foo/bar/baz.json", dataset.JSONDataFormat, ""}, {"foo/bar/baz.xml", dataset.XMLDataFormat, ""}, - {"foo/bar/baz.xls", dataset.XLSDataFormat, ""}, + {"foo/bar/baz.xlsx", dataset.XLSXDataFormat, ""}, {"foo/bar/baz.cbor", dataset.CBORDataFormat, ""}, {"foo/bar/baz", dataset.UnknownDataFormat, "no file extension provided"}, {"foo/bar/baz.jpg", dataset.UnknownDataFormat, "unsupported file type: '.jpg'"}, diff --git a/detect/determineFields.go b/detect/determineFields.go index c2024c8..9f97bda 100644 --- a/detect/determineFields.go +++ b/detect/determineFields.go @@ -34,6 +34,8 @@ func Schema(r *dataset.Structure, data io.Reader) (schema 
map[string]interface{} return JSONSchema(r, data) case dataset.CSVDataFormat: return CSVSchema(r, data) + case dataset.XLSXDataFormat: + return XLSXSchema(r, data) default: err = fmt.Errorf("'%s' is not supported for field detection", r.Format) return diff --git a/detect/xlsx.go b/detect/xlsx.go new file mode 100644 index 0000000..3c0310a --- /dev/null +++ b/detect/xlsx.go @@ -0,0 +1,13 @@ +package detect + +import ( + "io" + + "github.com/qri-io/dataset" +) + +// XLSXSchema determines any schema information for an excel spreadsheet +// TODO (b5): currently unimplemented +func XLSXSchema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) { + return dataset.BaseSchemaArray, 0, nil +} diff --git a/dsfs/dataset.go b/dsfs/dataset.go index f312c7f..add6f21 100644 --- a/dsfs/dataset.go +++ b/dsfs/dataset.go @@ -12,12 +12,12 @@ import ( "github.com/libp2p/go-libp2p-crypto" "github.com/multiformats/go-multihash" - "github.com/qri-io/qfs/cafs" "github.com/qri-io/dataset" "github.com/qri-io/dataset/dsio" "github.com/qri-io/dataset/validate" "github.com/qri-io/dsdiff" "github.com/qri-io/qfs" + "github.com/qri-io/qfs/cafs" ) // LoadDataset reads a dataset from a cafs and dereferences structure, transform, and commitMsg if they exist, @@ -284,6 +284,15 @@ func prepareDataset(store cafs.Filestore, ds, dsPrev *dataset.Dataset, privKey c cleanTitleAndMessage(&ds.Commit.Title, &ds.Commit.Message, diffDescription) + // TODO (b5): this is a hack until we have a better dataset differ + // "Structure: 2 changes" implies that the underlying bytes that represent the + // data have changed, but the actual data itself hasn't. + // two elements in structure are byte-sensitive: checksum and length + // we. need. better. diffing. tools. 
+ if ds.Commit.Title == "Structure: 2 changes" { + return "", fmt.Errorf("no meaningful changes detected") + } + ds.Commit.Timestamp = Timestamp() sb, _ := ds.SignableBytes() signedBytes, err := privKey.Sign(sb) diff --git a/dsio/cbor.go b/dsio/cbor.go index 3adb00c..e1bd1cc 100644 --- a/dsio/cbor.go +++ b/dsio/cbor.go @@ -21,6 +21,8 @@ type CBORReader struct { length int } +var _ EntryReader = (*CBORReader)(nil) + var ( bigen = binary.BigEndian ) @@ -94,6 +96,12 @@ func (r *CBORReader) ReadEntry() (ent Entry, err error) { return } +// Close finalizes the reader +func (r *CBORReader) Close() error { + // TODO (b5): check if underlying reader is an io.ReadCloser, call close here if so + return nil +} + const ( cborBdFalse byte = 0xf4 + iota cborBdTrue diff --git a/dsio/csv.go b/dsio/csv.go index 7ec90ce..18920a9 100644 --- a/dsio/csv.go +++ b/dsio/csv.go @@ -20,6 +20,8 @@ type CSVReader struct { types []string } +var _ EntryReader = (*CSVReader)(nil) + // NewCSVReader creates a reader from a structure and read source func NewCSVReader(st *dataset.Structure, r io.Reader) *CSVReader { // TODO - handle error @@ -80,6 +82,13 @@ func (r *CSVReader) ReadEntry() (Entry, error) { return Entry{Value: value}, nil } +// Close finalizes the reader +func (r *CSVReader) Close() error { + // TODO (b5): we should retain a reference to the underlying reader & + // check if it's an io.ReadCloser, calling close here if so + return nil +} + // decode uses specified types from structure's schema to cast csv string values to their // intended types. If casting fails because the data is invalid, it's left as a string instead // of causing an error. 
diff --git a/dsio/dsio.go b/dsio/dsio.go index f627119..f765595 100644 --- a/dsio/dsio.go +++ b/dsio/dsio.go @@ -28,6 +28,8 @@ type EntryReader interface { Structure() *dataset.Structure // ReadVal reads one row of structured data from the reader ReadEntry() (Entry, error) + // Close finalizes the Reader + Close() error } // EntryReadWriter combines EntryWriter and EntryReader behaviors @@ -54,6 +56,8 @@ func NewEntryReader(st *dataset.Structure, r io.Reader) (EntryReader, error) { return NewJSONReader(st, r) case dataset.CSVDataFormat: return NewCSVReader(st, r), nil + case dataset.XLSXDataFormat: + return NewXLSXReader(st, r) case dataset.UnknownDataFormat: err := fmt.Errorf("structure must have a data format") log.Debug(err.Error()) @@ -74,6 +78,8 @@ func NewEntryWriter(st *dataset.Structure, w io.Writer) (EntryWriter, error) { return NewJSONWriter(st, w) case dataset.CSVDataFormat: return NewCSVWriter(st, w), nil + case dataset.XLSXDataFormat: + return NewXLSXWriter(st, w) case dataset.UnknownDataFormat: err := fmt.Errorf("structure must have a data format") log.Debug(err.Error()) diff --git a/dsio/entry_buffer.go b/dsio/entry_buffer.go index 223c607..337adb2 100644 --- a/dsio/entry_buffer.go +++ b/dsio/entry_buffer.go @@ -7,7 +7,7 @@ import ( ) // EntryBuffer mimics the behaviour of bytes.Buffer, but with structured Dataa -// Read and Write are replaced with ReadRow and WriteEntry. It's worth noting +// Read and Write are replaced with ReadEntry and WriteEntry. It's worth noting // that different data formats have idisyncrcies that affect the behavior // of buffers and their output. 
For example, EntryBuffer won't write things like // CSV header rows or enclosing JSON arrays until after the writer's diff --git a/dsio/identity.go b/dsio/identity.go new file mode 100644 index 0000000..f3c5fe6 --- /dev/null +++ b/dsio/identity.go @@ -0,0 +1,105 @@ +package dsio + +import ( + "fmt" + "io" + + "github.com/qri-io/dataset" +) + +// NewIdentityReader creates an EntryReader from native go types, passed in +// data must be of type []interface{} or map[string]interface{} +func NewIdentityReader(st *dataset.Structure, data interface{}) (*IdentityReader, error) { + r := &IdentityReader{st: st} + + if md, ok := data.(map[string]interface{}); ok { + r.entries = r.iterateMap(md) + } else if sd, ok := data.([]interface{}); ok { + r.entries = r.iterateSlice(sd) + } else { + return nil, fmt.Errorf("cannot create entry reader from type %T", data) + } + + return r, nil +} + +// IdentityReader is a dsio.EntryReader that works with native go types +type IdentityReader struct { + st *dataset.Structure + // entries is closed by the producer goroutine after the final Entry is sent + entries chan Entry +} + +var _ EntryReader = (*IdentityReader)(nil) + +// Structure gives the structure being read +func (r *IdentityReader) Structure() *dataset.Structure { + return r.st +} + +// ReadEntry reads one row of structured data, returning io.EOF once all entries are consumed +func (r *IdentityReader) ReadEntry() (Entry, error) { + ent, ok := <-r.entries + if !ok { + return Entry{}, io.EOF + } + return ent, nil +} + +// Close finalizes the reader +func (r *IdentityReader) Close() error { + // drain the channel so the producer goroutine is never left blocked + // on a send; the producer closes the channel after its final entry, + // which ends this loop + for range r.entries { + // discard unread entries + } + return nil +} + +func (r *IdentityReader) iterateMap(data map[string]interface{}) chan Entry { + res := make(chan Entry) + + go func() { + for key, val := range data { + res <- Entry{Key: key, Value: val} + } + close(res) + }() + + return res +} + +func (r *IdentityReader) iterateSlice(data []interface{}) chan Entry { + res := make(chan Entry) + + go func() { + for i, val := range data { + res <- 
Entry{Index: i, Value: val} + } + close(res) + }() + + return res +} + +// IdentityWriter is a dsio.EntryWriter that works with native go types +type IdentityWriter struct { + st *dataset.Structure +} + +// Structure gives the structure being written +func (w *IdentityWriter) Structure() *dataset.Structure { + return w.st +} + +// WriteEntry writes one "row" of structured data to the Writer +func (w *IdentityWriter) WriteEntry(e Entry) error { + return nil +} + +// Close finalizes the writer, indicating all entries +// have been written +func (w *IdentityWriter) Close() error { + return nil +} diff --git a/dsio/json.go b/dsio/json.go index d99e038..cfdcaa9 100644 --- a/dsio/json.go +++ b/dsio/json.go @@ -22,6 +22,8 @@ type JSONReader struct { prevSize int // when buffer is extended, remember how much of the old buffer to discard } +var _ EntryReader = (*JSONReader)(nil) + // NewJSONReader creates a reader from a structure and read source func NewJSONReader(st *dataset.Structure, r io.Reader) (*JSONReader, error) { // Huge buffer (a quarter of a MB) to speed up string reads. 
@@ -115,6 +117,13 @@ func (r *JSONReader) ReadEntry() (Entry, error) { return ent, nil } +// Close finalizes the reader +func (r *JSONReader) Close() error { + // TODO (b5): we should retain a reference to the underlying reader & + // check if it's an io.ReadCloser, calling close here if so + return nil +} + func isWhitespace(ch byte) bool { return ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' } diff --git a/dsio/streams.go b/dsio/streams.go index 6e8c420..a3d1c19 100644 --- a/dsio/streams.go +++ b/dsio/streams.go @@ -37,6 +37,12 @@ func (r *PagedReader) ReadEntry() (Entry, error) { return r.Reader.ReadEntry() } +// Close finalizes the reader, indicating no more entries +// will be read +func (r *PagedReader) Close() error { + return r.Reader.Close() +} + // Copy reads all entries from the reader and writes them to the writer func Copy(reader EntryReader, writer EntryWriter) error { for { diff --git a/dsio/testdata/xlsx/obj_cell/body.xlsx b/dsio/testdata/xlsx/obj_cell/body.xlsx new file mode 100644 index 0000000..564a142 Binary files /dev/null and b/dsio/testdata/xlsx/obj_cell/body.xlsx differ diff --git a/dsio/testdata/xlsx/simple/body.xlsx b/dsio/testdata/xlsx/simple/body.xlsx new file mode 100644 index 0000000..dc68a59 Binary files /dev/null and b/dsio/testdata/xlsx/simple/body.xlsx differ diff --git a/dsio/xlsx_test.go b/dsio/xlsx_test.go new file mode 100644 index 0000000..ad67976 --- /dev/null +++ b/dsio/xlsx_test.go @@ -0,0 +1,142 @@ +package dsio + +import ( + "bytes" + "os" + "testing" + + "github.com/qri-io/dataset" +) + +var xlsxStruct = &dataset.Structure{ + Format: "xlsx", + FormatConfig: map[string]interface{}{ + "sheetName": "Sheet1", + }, + Schema: map[string]interface{}{ + "type": "array", + "items": map[string]interface{}{ + "type": "array", + "items": []interface{}{ + map[string]interface{}{"title": "col_a", "type": "string"}, + map[string]interface{}{"title": "col_b", "type": "number"}, + map[string]interface{}{"title": "col_c", "type": 
"integer"}, + map[string]interface{}{"title": "col_d", "type": "boolean"}, + map[string]interface{}{"title": "col_e", "type": "object"}, + map[string]interface{}{"title": "col_f", "type": "array"}, + map[string]interface{}{"title": "col_g", "type": "null"}, + }, + }, + }, +} + +func TestXLSXReader(t *testing.T) { + f, err := os.Open("testdata/xlsx/simple/body.xlsx") + if err != nil { + t.Fatal(err.Error()) + } + + rdr, err := NewEntryReader(xlsxStruct, f) + if err != nil { + t.Errorf("error allocating EntryReader: %s", err.Error()) + return + } + count := 0 + for { + ent, err := rdr.ReadEntry() + if err != nil { + if err.Error() == "EOF" { + break + } + t.Errorf("unexpected error: %s", err.Error()) + return + } + + if arr, ok := ent.Value.([]interface{}); ok { + if len(arr) != 2 { + t.Errorf("invalid row length for row %d. expected %d, got %d", count, 7, len(arr)) + continue + } + } else { + t.Errorf("expected value to []interface{}. got: %#v", ent.Value) + continue + } + + count++ + } + if count != 4 { + t.Errorf("expected: %d rows, got: %d", 4, count) + } +} + +func TestColIndexToLetters(t *testing.T) { + cases := []struct { + in int + expect string + }{ + {0, "A"}, + {25, "Z"}, + {26, "AA"}, + } + for i, c := range cases { + got := ColIndexToLetters(c.in) + if got != c.expect { + t.Errorf("case %d expected: %s, got: %s", i, c.expect, got) + } + } +} + +func TestXLSXWriter(t *testing.T) { + rows := []Entry{ + // TODO - vary up test input + {Value: []interface{}{"a", float64(12), 23, nil}}, + {Value: []interface{}{"a", float64(12), 23, []interface{}{"foo", "bar"}}}, + {Value: []interface{}{"a", float64(12), 23, map[string]interface{}{"foo": "bar"}}}, + {Value: []interface{}{"a", float64(12), int64(23), false}}, + {Value: []interface{}{"a", float64(12), 23, false}}, + } + + buf := &bytes.Buffer{} + rw, err := NewEntryWriter(xlsxStruct, buf) + if err != nil { + t.Errorf("error allocating EntryWriter: %s", err.Error()) + return + } + st := rw.Structure() + if err := 
dataset.CompareStructures(st, xlsxStruct); err != nil { + t.Errorf("structure mismatch: %s", err.Error()) + return + } + + for i, row := range rows { + if err := rw.WriteEntry(row); err != nil { + t.Errorf("row %d write error: %s", i, err.Error()) + } + } + + if err := rw.Close(); err != nil { + t.Errorf("close reader error: %s", err.Error()) + return + } +} + +func BenchmarkXLSXReader(b *testing.B) { + st := &dataset.Structure{Format: "xlsx", Schema: dataset.BaseSchemaArray} + + for n := 0; n < b.N; n++ { + file, err := os.Open("testdata/movies/data.xlsx") + if err != nil { + b.Errorf("unexpected error: %s", err.Error()) + } + r, err := NewXLSXReader(st, file) + if err != nil { + b.Errorf("unexpected error: %s", err.Error()) + } + for { + _, err = r.ReadEntry() + if err != nil { + break + } + } + } +} diff --git a/dsio/xslx.go b/dsio/xslx.go new file mode 100644 index 0000000..6a869de --- /dev/null +++ b/dsio/xslx.go @@ -0,0 +1,317 @@ +package dsio + +import ( + "encoding/json" + "fmt" + "io" + "strconv" + + "github.com/360EntSecGroup-Skylar/excelize" + "github.com/qri-io/dataset" + "github.com/qri-io/dataset/vals" +) + +// XLSXReader implements the RowReader interface for the XLSX data format +type XLSXReader struct { + err error + st *dataset.Structure + sheetName string + file *excelize.File + r *excelize.Rows + idx int + types []string +} + +// NewXLSXReader creates a reader from a structure and read source +func NewXLSXReader(st *dataset.Structure, r io.Reader) (*XLSXReader, error) { + // TODO - handle error + _, types, _ := terribleHackToGetHeaderRowAndTypes(st) + + rdr := &XLSXReader{ + st: st, + types: types, + } + + // xlsxr := xlsx.NewReader(ReplaceSoloCarriageReturns(r)) + rdr.file, rdr.err = excelize.OpenReader(r) + if rdr.err != nil { + return rdr, rdr.err + } + + if fcg, err := dataset.ParseFormatConfigMap(dataset.XLSXDataFormat, st.FormatConfig); err == nil { + if opts, ok := fcg.(*dataset.XLSXOptions); ok { + rdr.sheetName = opts.SheetName + } + } 
+ if rdr.sheetName == "" { + rdr.sheetName = "Sheet1" + } + + if rdr.err == nil { + rdr.r, rdr.err = rdr.file.Rows(rdr.sheetName) + } + + return rdr, rdr.err +} + +// Structure gives this reader's structure +func (r *XLSXReader) Structure() *dataset.Structure { + return r.st +} + +// ReadEntry reads one XLSX record from the reader +func (r *XLSXReader) ReadEntry() (Entry, error) { + if r.err != nil { + return Entry{}, r.err + } + if !r.r.Next() { + return Entry{}, io.EOF + } + vals, err := r.decode(r.r.Columns()) + if err != nil { + return Entry{}, err + } + ent := Entry{Index: r.idx, Value: vals} + r.idx++ + + return ent, nil +} + +// decode uses specified types from structure's schema to cast xlsx string values to their +// intended types. If casting fails because the data is invalid, it's left as a string instead +// of causing an error. +func (r *XLSXReader) decode(strings []string) ([]interface{}, error) { + vs := make([]interface{}, len(strings)) + types := r.types + if len(types) < len(strings) { + // TODO - fix. 
for now if types fails to parse we just assume all types + // are strings + types = make([]string, len(strings)) + for i := range types { + types[i] = "string" + } + } + for i, str := range strings { + vs[i] = str + + switch types[i] { + case "number": + if num, err := vals.ParseNumber([]byte(str)); err == nil { + vs[i] = num + } + case "integer": + if num, err := vals.ParseInteger([]byte(str)); err == nil { + vs[i] = num + } + case "boolean": + if b, err := vals.ParseBoolean([]byte(str)); err == nil { + vs[i] = b + } + case "object": + v := map[string]interface{}{} + if err := json.Unmarshal([]byte(str), &v); err == nil { + vs[i] = v + } + case "array": + v := []interface{}{} + if err := json.Unmarshal([]byte(str), &v); err == nil { + vs[i] = v + } + case "null": + vs[i] = nil + } + } + + return vs, nil +} + +// Close finalizes the reader, indicating no more records will be read +func (r *XLSXReader) Close() error { + return nil +} + +// XLSXWriter implements the RowWriter interface for +// XLSX-formatted data +type XLSXWriter struct { + rowsWritten int + sheetName string + f *excelize.File + st *dataset.Structure + w io.Writer + types []string +} + +// NewXLSXWriter creates a Writer from a structure and write destination +func NewXLSXWriter(st *dataset.Structure, w io.Writer) (*XLSXWriter, error) { + // TODO - capture error + _, types, _ := terribleHackToGetHeaderRowAndTypes(st) + + wr := &XLSXWriter{ + st: st, + f: excelize.NewFile(), + types: types, + w: w, + } + + if fcg, err := dataset.ParseFormatConfigMap(dataset.XLSXDataFormat, st.FormatConfig); err == nil { + if opts, ok := fcg.(*dataset.XLSXOptions); ok { + wr.sheetName = opts.SheetName + } + } else { + return nil, err + } + + if wr.sheetName == "" { + wr.sheetName = "Sheet1" + } + + idx := wr.f.NewSheet(wr.sheetName) + wr.f.SetActiveSheet(idx) + + return wr, nil +} + +// Structure gives this writer's structure +func (w *XLSXWriter) Structure() *dataset.Structure { + return w.st +} + +// WriteEntry writes 
one XLSX record to the writer +func (w *XLSXWriter) WriteEntry(ent Entry) error { + if arr, ok := ent.Value.([]interface{}); ok { + strs, err := encodeStrings(arr) + if err != nil { + log.Debug(err.Error()) + return fmt.Errorf("error encoding entry: %s", err.Error()) + } + for i, str := range strs { + w.f.SetCellValue(w.sheetName, w.axis(i), str) + } + w.rowsWritten++ + return nil + } + return fmt.Errorf("expected array value to write xlsx row. got: %v", ent) +} + +func (w *XLSXWriter) axis(colIDx int) string { + return ColIndexToLetters(colIDx) + strconv.Itoa(w.rowsWritten+1) +} + +// Close finalizes the writer, indicating no more records +// will be written +func (w *XLSXWriter) Close() error { + _, err := w.f.WriteTo(w.w) + return err +} + +func encodeStrings(vs []interface{}) (strs []string, err error) { + strs = make([]string, len(vs)) + for i, v := range vs { + if v == nil { + continue + } + switch x := v.(type) { + case int: + strs[i] = strconv.Itoa(x) + case int64: + strs[i] = strconv.FormatInt(x, 10) + case float64: + strs[i] = strconv.FormatFloat(x, 'f', -1, 64) + case bool: + strs[i] = strconv.FormatBool(x) + case string: + strs[i] = x + case []interface{}: + data, err := json.Marshal(x) + if err != nil { + return strs, err + } + strs[i] = string(data) + case map[string]interface{}: + data, err := json.Marshal(x) + if err != nil { + return strs, err + } + strs[i] = string(data) + default: + return strs, fmt.Errorf("unrecognized encoding type: %#v", v) + } + } + return +} + +// ColIndexToLetters is used to convert a zero based, numeric column +// identifier into a character code. 
+func ColIndexToLetters(colRef int) string { + parts := intToBase26(colRef) + return formatColumnName(smooshBase26Slice(parts)) +} + +// largestLesserExponent returns the largest power of base that does not exceed num, +// or 1 when num is smaller than base +func largestLesserExponent(num, base int) int { + prev := 1 + exp := base + for num >= exp { + prev = exp + exp = exp * base + } + return prev +} + +// Converts a list of numbers representing a column into an alphabetic +// representation, as used in the spreadsheet. +func formatColumnName(colID []int) string { + lastPart := len(colID) - 1 + + result := "" + for n, part := range colID { + if n == lastPart { + // The least significant number is in the + // range 0-25, all other numbers are 1-26, + // hence we use a different offset for the + // last part. + result += string(rune(part + 65)) + } else { + // Don't output leading 0s, as there is no + // representation of 0 in this format. + if part > 0 { + result += string(rune(part + 64)) + } + } + } + return result +} + +func smooshBase26Slice(b26 []int) []int { + // Smoosh values together, eliminating 0s from all but the + // least significant part. + lastButOnePart := len(b26) - 2 + for i := lastButOnePart; i > 0; i-- { + part := b26[i] + if part == 0 { + greaterPart := b26[i-1] + if greaterPart > 0 { + b26[i-1] = greaterPart - 1 + b26[i] = 26 + } + } + } + return b26 +} + +func intToBase26(x int) (parts []int) { + // Excel column codes are pure evil - in essence they're just + // base26, but they don't represent the number 0. + b26Denominator := largestLesserExponent(x, 26) + + // This loop terminates because integer division of 1 / 26 + // returns 0. 
+ for d := b26Denominator; d > 0; d = d / 26 { + value := x / d + remainder := x % d + parts = append(parts, value) + x = remainder + } + return parts +} diff --git a/dsutil/http.go b/dsutil/http.go index 7612adf..3d2ac6a 100644 --- a/dsutil/http.go +++ b/dsutil/http.go @@ -15,10 +15,6 @@ import ( // FormFileDataset extracts a dataset document from a http Request func FormFileDataset(r *http.Request, ds *dataset.Dataset) (err error) { - ds.Peername = r.FormValue("peername") - ds.Name = r.FormValue("name") - ds.BodyPath = r.FormValue("body_path") - datafile, dataHeader, err := r.FormFile("file") if err == http.ErrMissingFile { err = nil @@ -47,6 +43,16 @@ func FormFileDataset(r *http.Request, ds *dataset.Dataset) (err error) { } } + if peername := r.FormValue("peername"); peername != "" { + ds.Peername = peername + } + if name := r.FormValue("name"); name != "" { + ds.Name = name + } + if bp := r.FormValue("body_path"); bp != "" { + ds.BodyPath = bp + } + tfFile, tfHeader, err := r.FormFile("transform") if err == http.ErrMissingFile { err = nil diff --git a/generate/generate.go b/generate/generate.go index a8dc2b5..101d9b8 100644 --- a/generate/generate.go +++ b/generate/generate.go @@ -62,6 +62,11 @@ func (g Generator) Structure() *dataset.Structure { return g.structure } +// Close finalizes the generator +func (g Generator) Close() error { + return nil +} + var alphaNumericRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") // randString generates a random string of alpha numeric characters up to maxLen runes long.