Skip to content

Commit

Permalink
feat(xlsx): initial support for xlsx data format, identity dsio
Browse files Browse the repository at this point in the history
Merge pull request #170 from qri-io/xlsx
  • Loading branch information
b5 authored Feb 5, 2019
2 parents bd4df7a + a356f38 commit d20b4dc
Show file tree
Hide file tree
Showing 23 changed files with 756 additions and 21 deletions.
1 change: 1 addition & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
github.com/ghodss/yaml
github.com/ipfs/go-log
gopkg.in/yaml.v2
github.com/360EntSecGroup-Skylar/excelize
- run:
name: Run Lint Tests
command: golint ./...
Expand Down
12 changes: 6 additions & 6 deletions data_format.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ const (
// XMLDataFormat specifies eXtensible Markup Language-formatted data
// currently not supported.
XMLDataFormat
// XLSDataFormat specifies microsoft excel formatted data
// currently not supported.
XLSDataFormat
// XLSXDataFormat specifies microsoft excel formatted data
XLSXDataFormat
)

// SupportedDataFormats gives a slice of data formats that are
Expand All @@ -44,6 +43,7 @@ func SupportedDataFormats() []DataFormat {
CBORDataFormat,
JSONDataFormat,
CSVDataFormat,
XLSXDataFormat,
}
}

Expand All @@ -54,7 +54,7 @@ func (f DataFormat) String() string {
CSVDataFormat: "csv",
JSONDataFormat: "json",
XMLDataFormat: "xml",
XLSDataFormat: "xls",
XLSXDataFormat: "xlsx",
CBORDataFormat: "cbor",
}[f]

Expand All @@ -76,8 +76,8 @@ func ParseDataFormatString(s string) (df DataFormat, err error) {
"json": JSONDataFormat,
".xml": XMLDataFormat,
"xml": XMLDataFormat,
".xls": XLSDataFormat,
"xls": XLSDataFormat,
".xlsx": XLSXDataFormat,
"xlsx": XLSXDataFormat,
"cbor": CBORDataFormat,
".cbor": CBORDataFormat,
}[s]
Expand Down
43 changes: 43 additions & 0 deletions data_format_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ func ParseFormatConfigMap(f DataFormat, opts map[string]interface{}) (FormatConf
return NewCSVOptions(opts)
case JSONDataFormat:
return NewJSONOptions(opts)
case XLSXDataFormat:
return NewXLSXOptions(opts)
default:
return nil, fmt.Errorf("cannot parse configuration for format: %s", f.String())
}
Expand Down Expand Up @@ -141,3 +143,44 @@ func (o *JSONOptions) Map() map[string]interface{} {
}
return map[string]interface{}{}
}

// XLSXOptions specifies configuraiton details for the xlsx file format
type XLSXOptions struct {
SheetName string `json:"sheetName,omitempty"`
}

// NewXLSXOptions creates a XLSXOptions pointer from a map
func NewXLSXOptions(opts map[string]interface{}) (FormatConfig, error) {
o := &XLSXOptions{}
if opts == nil {
return o, nil
}

if opts["sheetName"] != nil {
if sheetName, ok := opts["sheetName"].(string); ok {
o.SheetName = sheetName
} else {
return nil, fmt.Errorf("invalid sheetName value: %v", opts["sheetName"])
}
}

return o, nil
}

// Format announces the XLSX data format for the FormatConfig interface
func (*XLSXOptions) Format() DataFormat {
return XLSXDataFormat
}

// Map structures XLSXOptions as a map of string keys to values
func (o *XLSXOptions) Map() map[string]interface{} {
if o == nil {
return nil
}
opt := map[string]interface{}{}
if o.SheetName != "" {
opt["sheetName"] = o.SheetName
}

return opt
}
55 changes: 54 additions & 1 deletion data_format_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func TestParseFormatConfigMap(t *testing.T) {
}{
{CSVDataFormat, map[string]interface{}{}, &CSVOptions{}, ""},
{JSONDataFormat, map[string]interface{}{}, &JSONOptions{}, ""},
{XLSDataFormat, map[string]interface{}{}, nil, "cannot parse configuration for format: xls"},
{XLSXDataFormat, map[string]interface{}{}, &XLSXOptions{}, ""},
}

for i, c := range cases {
Expand Down Expand Up @@ -136,3 +136,56 @@ func TestJSONOptionsMap(t *testing.T) {
}
}
}

func TestNewXLSXOptions(t *testing.T) {
cases := []struct {
opts map[string]interface{}
res *XLSXOptions
err string
}{
{nil, &XLSXOptions{}, ""},
{map[string]interface{}{}, &XLSXOptions{}, ""},
{map[string]interface{}{"sheetName": "foo"}, &XLSXOptions{SheetName: "foo"}, ""},
{map[string]interface{}{"sheetName": true}, nil, "invalid sheetName value: true"},
}

for i, c := range cases {
got, err := NewXLSXOptions(c.opts)
if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
t.Errorf("case %d error expected: '%s', got: '%s'", i, c.err, err)
continue
}
if c.err == "" {
xlsxo, ok := got.(*XLSXOptions)
if !ok {
t.Errorf("case %d didn't return a CSVOptions pointer", i)
continue
}

if xlsxo.SheetName != c.res.SheetName {
t.Errorf("case %d SheetName expected: %s, got: %s", i, xlsxo.SheetName, c.res.SheetName)
continue
}
}
}
}

func TestXLSXOptionsMap(t *testing.T) {
cases := []struct {
opt *XLSXOptions
res map[string]interface{}
}{
{nil, nil},
{&XLSXOptions{}, map[string]interface{}{}},
{&XLSXOptions{SheetName: "foo"}, map[string]interface{}{"sheetName": "foo"}},
}

for i, c := range cases {
got := c.opt.Map()
for key, val := range c.res {
if got[key] != val {
t.Errorf("case %d, key '%s' expected: '%s' got:'%s'", i, key, val, got[key])
}
}
}
}
11 changes: 6 additions & 5 deletions data_format_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ func TestSupportedDataFormats(t *testing.T) {
CBORDataFormat,
JSONDataFormat,
CSVDataFormat,
XLSXDataFormat,
}

for i, f := range SupportedDataFormats() {
Expand All @@ -28,7 +29,7 @@ func TestDataFormatString(t *testing.T) {
{CSVDataFormat, "csv"},
{JSONDataFormat, "json"},
{XMLDataFormat, "xml"},
{XLSDataFormat, "xls"},
{XLSXDataFormat, "xlsx"},
{CBORDataFormat, "cbor"},
}

Expand All @@ -53,8 +54,8 @@ func TestParseDataFormatString(t *testing.T) {
{"json", JSONDataFormat, ""},
{".xml", XMLDataFormat, ""},
{"xml", XMLDataFormat, ""},
{".xls", XLSDataFormat, ""},
{"xls", XLSDataFormat, ""},
{".xlsx", XLSXDataFormat, ""},
{"xlsx", XLSXDataFormat, ""},
{"cbor", CBORDataFormat, ""},
{".cbor", CBORDataFormat, ""},
}
Expand All @@ -81,7 +82,7 @@ func TestDataFormatMarshalJSON(t *testing.T) {
{CSVDataFormat, []byte(`"csv"`), ""},
{JSONDataFormat, []byte(`"json"`), ""},
{XMLDataFormat, []byte(`"xml"`), ""},
{XLSDataFormat, []byte(`"xls"`), ""},
{XLSXDataFormat, []byte(`"xlsx"`), ""},
{CBORDataFormat, []byte(`"cbor"`), ""},
}
for i, c := range cases {
Expand All @@ -106,7 +107,7 @@ func TestDataFormatUnmarshalJSON(t *testing.T) {
{[]byte(`"csv"`), CSVDataFormat, ""},
{[]byte(`"json"`), JSONDataFormat, ""},
{[]byte(`"xml"`), XMLDataFormat, ""},
{[]byte(`"xls"`), XLSDataFormat, ""},
{[]byte(`"xlsx"`), XLSXDataFormat, ""},
{[]byte(`"cbor"`), CBORDataFormat, ""},
}

Expand Down
4 changes: 2 additions & 2 deletions detect/detect.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ func ExtensionDataFormat(path string) (format dataset.DataFormat, err error) {
return dataset.CSVDataFormat, nil
case ".xml":
return dataset.XMLDataFormat, nil
case ".xls":
return dataset.XLSDataFormat, nil
case ".xlsx":
return dataset.XLSXDataFormat, nil
case "":
return dataset.UnknownDataFormat, errors.New("no file extension provided")
default:
Expand Down
2 changes: 1 addition & 1 deletion detect/detect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func TestExtensionDataFormat(t *testing.T) {
{"foo/bar/baz.csv", dataset.CSVDataFormat, ""},
{"foo/bar/baz.json", dataset.JSONDataFormat, ""},
{"foo/bar/baz.xml", dataset.XMLDataFormat, ""},
{"foo/bar/baz.xls", dataset.XLSDataFormat, ""},
{"foo/bar/baz.xlsx", dataset.XLSXDataFormat, ""},
{"foo/bar/baz.cbor", dataset.CBORDataFormat, ""},
{"foo/bar/baz", dataset.UnknownDataFormat, "no file extension provided"},
{"foo/bar/baz.jpg", dataset.UnknownDataFormat, "unsupported file type: '.jpg'"},
Expand Down
2 changes: 2 additions & 0 deletions detect/determineFields.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ func Schema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}
return JSONSchema(r, data)
case dataset.CSVDataFormat:
return CSVSchema(r, data)
case dataset.XLSXDataFormat:
return XLSXSchema(r, data)
default:
err = fmt.Errorf("'%s' is not supported for field detection", r.Format)
return
Expand Down
13 changes: 13 additions & 0 deletions detect/xlsx.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package detect

import (
"io"

"github.com/qri-io/dataset"
)

// XLSXSchema determines any schema information for an excel spreadsheet
// TODO (b5): currently unimplemented
func XLSXSchema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) {
return dataset.BaseSchemaArray, 0, nil
}
11 changes: 10 additions & 1 deletion dsfs/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ import (

"github.com/libp2p/go-libp2p-crypto"
"github.com/multiformats/go-multihash"
"github.com/qri-io/qfs/cafs"
"github.com/qri-io/dataset"
"github.com/qri-io/dataset/dsio"
"github.com/qri-io/dataset/validate"
"github.com/qri-io/dsdiff"
"github.com/qri-io/qfs"
"github.com/qri-io/qfs/cafs"
)

// LoadDataset reads a dataset from a cafs and dereferences structure, transform, and commitMsg if they exist,
Expand Down Expand Up @@ -284,6 +284,15 @@ func prepareDataset(store cafs.Filestore, ds, dsPrev *dataset.Dataset, privKey c

cleanTitleAndMessage(&ds.Commit.Title, &ds.Commit.Message, diffDescription)

// TODO (b5): this is a hack until we have a better dataset differ
// "Structure: 2 changes" implies that the underlying bytes that represent the
// data has changed, but the acutal data itself hasn't.
// two elements in structure are byte-sensitive: checksum and length
// we. need. better. diffing. tools.
if ds.Commit.Title == "Structure: 2 changes" {
return "", fmt.Errorf("no meaningful changes detected")
}

ds.Commit.Timestamp = Timestamp()
sb, _ := ds.SignableBytes()
signedBytes, err := privKey.Sign(sb)
Expand Down
8 changes: 8 additions & 0 deletions dsio/cbor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ type CBORReader struct {
length int
}

var _ EntryReader = (*CBORReader)(nil)

var (
bigen = binary.BigEndian
)
Expand Down Expand Up @@ -94,6 +96,12 @@ func (r *CBORReader) ReadEntry() (ent Entry, err error) {
return
}

// Close finalizes the reader
func (r *CBORReader) Close() error {
// TODO (b5): check if underlying reader is an io.ReadCloser, call close here if so
return nil
}

const (
cborBdFalse byte = 0xf4 + iota
cborBdTrue
Expand Down
9 changes: 9 additions & 0 deletions dsio/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ type CSVReader struct {
types []string
}

var _ EntryReader = (*CSVReader)(nil)

// NewCSVReader creates a reader from a structure and read source
func NewCSVReader(st *dataset.Structure, r io.Reader) *CSVReader {
// TODO - handle error
Expand Down Expand Up @@ -80,6 +82,13 @@ func (r *CSVReader) ReadEntry() (Entry, error) {
return Entry{Value: value}, nil
}

// Close finalizes the reader
func (r *CSVReader) Close() error {
// TODO (b5): we should retain a reference to the underlying reader &
// check if it's an io.ReadCloser, calling close here if so
return nil
}

// decode uses specified types from structure's schema to cast csv string values to their
// intended types. If casting fails because the data is invalid, it's left as a string instead
// of causing an error.
Expand Down
6 changes: 6 additions & 0 deletions dsio/dsio.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ type EntryReader interface {
Structure() *dataset.Structure
// ReadVal reads one row of structured data from the reader
ReadEntry() (Entry, error)
// Close finalizes the Reader
Close() error
}

// EntryReadWriter combines EntryWriter and EntryReader behaviors
Expand All @@ -54,6 +56,8 @@ func NewEntryReader(st *dataset.Structure, r io.Reader) (EntryReader, error) {
return NewJSONReader(st, r)
case dataset.CSVDataFormat:
return NewCSVReader(st, r), nil
case dataset.XLSXDataFormat:
return NewXLSXReader(st, r)
case dataset.UnknownDataFormat:
err := fmt.Errorf("structure must have a data format")
log.Debug(err.Error())
Expand All @@ -74,6 +78,8 @@ func NewEntryWriter(st *dataset.Structure, w io.Writer) (EntryWriter, error) {
return NewJSONWriter(st, w)
case dataset.CSVDataFormat:
return NewCSVWriter(st, w), nil
case dataset.XLSXDataFormat:
return NewXLSXWriter(st, w)
case dataset.UnknownDataFormat:
err := fmt.Errorf("structure must have a data format")
log.Debug(err.Error())
Expand Down
2 changes: 1 addition & 1 deletion dsio/entry_buffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
)

// EntryBuffer mimics the behaviour of bytes.Buffer, but with structured Dataa
// Read and Write are replaced with ReadRow and WriteEntry. It's worth noting
// Read and Write are replaced with ReadEntry and WriteEntry. It's worth noting
// that different data formats have idisyncrcies that affect the behavior
// of buffers and their output. For example, EntryBuffer won't write things like
// CSV header rows or enclosing JSON arrays until after the writer's
Expand Down
Loading

0 comments on commit d20b4dc

Please sign in to comment.