Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(xlsx): initial support for xlsx data format, identity dsio #170

Merged
merged 2 commits into from
Feb 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
github.com/ghodss/yaml
github.com/ipfs/go-log
gopkg.in/yaml.v2
github.com/360EntSecGroup-Skylar/excelize
- run:
name: Run Lint Tests
command: golint ./...
Expand Down
12 changes: 6 additions & 6 deletions data_format.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ const (
// XMLDataFormat specifies eXtensible Markup Language-formatted data
// currently not supported.
XMLDataFormat
// XLSDataFormat specifies microsoft excel formatted data
// currently not supported.
XLSDataFormat
// XLSXDataFormat specifies microsoft excel formatted data
XLSXDataFormat
)

// SupportedDataFormats gives a slice of data formats that are
Expand All @@ -44,6 +43,7 @@ func SupportedDataFormats() []DataFormat {
CBORDataFormat,
JSONDataFormat,
CSVDataFormat,
XLSXDataFormat,
}
}

Expand All @@ -54,7 +54,7 @@ func (f DataFormat) String() string {
CSVDataFormat: "csv",
JSONDataFormat: "json",
XMLDataFormat: "xml",
XLSDataFormat: "xls",
XLSXDataFormat: "xlsx",
CBORDataFormat: "cbor",
}[f]

Expand All @@ -76,8 +76,8 @@ func ParseDataFormatString(s string) (df DataFormat, err error) {
"json": JSONDataFormat,
".xml": XMLDataFormat,
"xml": XMLDataFormat,
".xls": XLSDataFormat,
"xls": XLSDataFormat,
".xlsx": XLSXDataFormat,
"xlsx": XLSXDataFormat,
"cbor": CBORDataFormat,
".cbor": CBORDataFormat,
}[s]
Expand Down
43 changes: 43 additions & 0 deletions data_format_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ func ParseFormatConfigMap(f DataFormat, opts map[string]interface{}) (FormatConf
return NewCSVOptions(opts)
case JSONDataFormat:
return NewJSONOptions(opts)
case XLSXDataFormat:
return NewXLSXOptions(opts)
default:
return nil, fmt.Errorf("cannot parse configuration for format: %s", f.String())
}
Expand Down Expand Up @@ -141,3 +143,44 @@ func (o *JSONOptions) Map() map[string]interface{} {
}
return map[string]interface{}{}
}

// XLSXOptions specifies configuraiton details for the xlsx file format
type XLSXOptions struct {
SheetName string `json:"sheetName,omitempty"`
}

// NewXLSXOptions creates a XLSXOptions pointer from a map
func NewXLSXOptions(opts map[string]interface{}) (FormatConfig, error) {
o := &XLSXOptions{}
if opts == nil {
return o, nil
}

if opts["sheetName"] != nil {
if sheetName, ok := opts["sheetName"].(string); ok {
o.SheetName = sheetName
} else {
return nil, fmt.Errorf("invalid sheetName value: %v", opts["sheetName"])
}
}

return o, nil
}

// Format announces the XLSX data format for the FormatConfig interface
func (*XLSXOptions) Format() DataFormat {
return XLSXDataFormat
}

// Map structures XLSXOptions as a map of string keys to values
func (o *XLSXOptions) Map() map[string]interface{} {
if o == nil {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check nil here? In the rest of codebase, we're assuming that interface types won't contains a nil pointer value, for example see lib/datasets.go. Seems like an anti-pattern to support it here, and the code will just crash a few lines below where we try to resolve o.SheetName.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally agreed with this. All three FormatConfig variants have this quirk, so I'd like to remove it in a separate PR with some proper checks. I'll file an issue

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return nil
}
opt := map[string]interface{}{}
if o.SheetName != "" {
opt["sheetName"] = o.SheetName
}

return opt
}
55 changes: 54 additions & 1 deletion data_format_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func TestParseFormatConfigMap(t *testing.T) {
}{
{CSVDataFormat, map[string]interface{}{}, &CSVOptions{}, ""},
{JSONDataFormat, map[string]interface{}{}, &JSONOptions{}, ""},
{XLSDataFormat, map[string]interface{}{}, nil, "cannot parse configuration for format: xls"},
{XLSXDataFormat, map[string]interface{}{}, &XLSXOptions{}, ""},
}

for i, c := range cases {
Expand Down Expand Up @@ -136,3 +136,56 @@ func TestJSONOptionsMap(t *testing.T) {
}
}
}

func TestNewXLSXOptions(t *testing.T) {
cases := []struct {
opts map[string]interface{}
res *XLSXOptions
err string
}{
{nil, &XLSXOptions{}, ""},
{map[string]interface{}{}, &XLSXOptions{}, ""},
{map[string]interface{}{"sheetName": "foo"}, &XLSXOptions{SheetName: "foo"}, ""},
{map[string]interface{}{"sheetName": true}, nil, "invalid sheetName value: true"},
}

for i, c := range cases {
got, err := NewXLSXOptions(c.opts)
if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
t.Errorf("case %d error expected: '%s', got: '%s'", i, c.err, err)
continue
}
if c.err == "" {
xlsxo, ok := got.(*XLSXOptions)
if !ok {
t.Errorf("case %d didn't return a CSVOptions pointer", i)
continue
}

if xlsxo.SheetName != c.res.SheetName {
t.Errorf("case %d SheetName expected: %s, got: %s", i, xlsxo.SheetName, c.res.SheetName)
continue
}
}
}
}

func TestXLSXOptionsMap(t *testing.T) {
cases := []struct {
opt *XLSXOptions
res map[string]interface{}
}{
{nil, nil},
{&XLSXOptions{}, map[string]interface{}{}},
{&XLSXOptions{SheetName: "foo"}, map[string]interface{}{"sheetName": "foo"}},
}

for i, c := range cases {
got := c.opt.Map()
for key, val := range c.res {
if got[key] != val {
t.Errorf("case %d, key '%s' expected: '%s' got:'%s'", i, key, val, got[key])
}
}
}
}
11 changes: 6 additions & 5 deletions data_format_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ func TestSupportedDataFormats(t *testing.T) {
CBORDataFormat,
JSONDataFormat,
CSVDataFormat,
XLSXDataFormat,
}

for i, f := range SupportedDataFormats() {
Expand All @@ -28,7 +29,7 @@ func TestDataFormatString(t *testing.T) {
{CSVDataFormat, "csv"},
{JSONDataFormat, "json"},
{XMLDataFormat, "xml"},
{XLSDataFormat, "xls"},
{XLSXDataFormat, "xlsx"},
{CBORDataFormat, "cbor"},
}

Expand All @@ -53,8 +54,8 @@ func TestParseDataFormatString(t *testing.T) {
{"json", JSONDataFormat, ""},
{".xml", XMLDataFormat, ""},
{"xml", XMLDataFormat, ""},
{".xls", XLSDataFormat, ""},
{"xls", XLSDataFormat, ""},
{".xlsx", XLSXDataFormat, ""},
{"xlsx", XLSXDataFormat, ""},
{"cbor", CBORDataFormat, ""},
{".cbor", CBORDataFormat, ""},
}
Expand All @@ -81,7 +82,7 @@ func TestDataFormatMarshalJSON(t *testing.T) {
{CSVDataFormat, []byte(`"csv"`), ""},
{JSONDataFormat, []byte(`"json"`), ""},
{XMLDataFormat, []byte(`"xml"`), ""},
{XLSDataFormat, []byte(`"xls"`), ""},
{XLSXDataFormat, []byte(`"xlsx"`), ""},
{CBORDataFormat, []byte(`"cbor"`), ""},
}
for i, c := range cases {
Expand All @@ -106,7 +107,7 @@ func TestDataFormatUnmarshalJSON(t *testing.T) {
{[]byte(`"csv"`), CSVDataFormat, ""},
{[]byte(`"json"`), JSONDataFormat, ""},
{[]byte(`"xml"`), XMLDataFormat, ""},
{[]byte(`"xls"`), XLSDataFormat, ""},
{[]byte(`"xlsx"`), XLSXDataFormat, ""},
{[]byte(`"cbor"`), CBORDataFormat, ""},
}

Expand Down
4 changes: 2 additions & 2 deletions detect/detect.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ func ExtensionDataFormat(path string) (format dataset.DataFormat, err error) {
return dataset.CSVDataFormat, nil
case ".xml":
return dataset.XMLDataFormat, nil
case ".xls":
return dataset.XLSDataFormat, nil
case ".xlsx":
return dataset.XLSXDataFormat, nil
case "":
return dataset.UnknownDataFormat, errors.New("no file extension provided")
default:
Expand Down
2 changes: 1 addition & 1 deletion detect/detect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func TestExtensionDataFormat(t *testing.T) {
{"foo/bar/baz.csv", dataset.CSVDataFormat, ""},
{"foo/bar/baz.json", dataset.JSONDataFormat, ""},
{"foo/bar/baz.xml", dataset.XMLDataFormat, ""},
{"foo/bar/baz.xls", dataset.XLSDataFormat, ""},
{"foo/bar/baz.xlsx", dataset.XLSXDataFormat, ""},
{"foo/bar/baz.cbor", dataset.CBORDataFormat, ""},
{"foo/bar/baz", dataset.UnknownDataFormat, "no file extension provided"},
{"foo/bar/baz.jpg", dataset.UnknownDataFormat, "unsupported file type: '.jpg'"},
Expand Down
2 changes: 2 additions & 0 deletions detect/determineFields.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ func Schema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}
return JSONSchema(r, data)
case dataset.CSVDataFormat:
return CSVSchema(r, data)
case dataset.XLSXDataFormat:
return XLSXSchema(r, data)
default:
err = fmt.Errorf("'%s' is not supported for field detection", r.Format)
return
Expand Down
13 changes: 13 additions & 0 deletions detect/xlsx.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package detect

import (
"io"

"github.com/qri-io/dataset"
)

// XLSXSchema determines any schema information for an excel spreadsheet
// TODO (b5): currently unimplemented
func XLSXSchema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) {
return dataset.BaseSchemaArray, 0, nil
}
11 changes: 10 additions & 1 deletion dsfs/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ import (

"github.com/libp2p/go-libp2p-crypto"
"github.com/multiformats/go-multihash"
"github.com/qri-io/qfs/cafs"
"github.com/qri-io/dataset"
"github.com/qri-io/dataset/dsio"
"github.com/qri-io/dataset/validate"
"github.com/qri-io/dsdiff"
"github.com/qri-io/qfs"
"github.com/qri-io/qfs/cafs"
)

// LoadDataset reads a dataset from a cafs and dereferences structure, transform, and commitMsg if they exist,
Expand Down Expand Up @@ -284,6 +284,15 @@ func prepareDataset(store cafs.Filestore, ds, dsPrev *dataset.Dataset, privKey c

cleanTitleAndMessage(&ds.Commit.Title, &ds.Commit.Message, diffDescription)

// TODO (b5): this is a hack until we have a better dataset differ
// "Structure: 2 changes" implies that the underlying bytes that represent the
// data has changed, but the acutal data itself hasn't.
// two elements in structure are byte-sensitive: checksum and length
// we. need. better. diffing. tools.
if ds.Commit.Title == "Structure: 2 changes" {
return "", fmt.Errorf("no meaningful changes detected")
}

ds.Commit.Timestamp = Timestamp()
sb, _ := ds.SignableBytes()
signedBytes, err := privKey.Sign(sb)
Expand Down
8 changes: 8 additions & 0 deletions dsio/cbor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ type CBORReader struct {
length int
}

var _ EntryReader = (*CBORReader)(nil)

var (
bigen = binary.BigEndian
)
Expand Down Expand Up @@ -94,6 +96,12 @@ func (r *CBORReader) ReadEntry() (ent Entry, err error) {
return
}

// Close finalizes the reader
func (r *CBORReader) Close() error {
// TODO (b5): check if underlying reader is an io.ReadCloser, call close here if so
return nil
}

const (
cborBdFalse byte = 0xf4 + iota
cborBdTrue
Expand Down
9 changes: 9 additions & 0 deletions dsio/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ type CSVReader struct {
types []string
}

var _ EntryReader = (*CSVReader)(nil)

// NewCSVReader creates a reader from a structure and read source
func NewCSVReader(st *dataset.Structure, r io.Reader) *CSVReader {
// TODO - handle error
Expand Down Expand Up @@ -80,6 +82,13 @@ func (r *CSVReader) ReadEntry() (Entry, error) {
return Entry{Value: value}, nil
}

// Close finalizes the reader
func (r *CSVReader) Close() error {
// TODO (b5): we should retain a reference to the underlying reader &
// check if it's an io.ReadCloser, calling close here if so
return nil
}

// decode uses specified types from structure's schema to cast csv string values to their
// intended types. If casting fails because the data is invalid, it's left as a string instead
// of causing an error.
Expand Down
6 changes: 6 additions & 0 deletions dsio/dsio.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ type EntryReader interface {
Structure() *dataset.Structure
// ReadVal reads one row of structured data from the reader
ReadEntry() (Entry, error)
// Close finalizes the Reader
Close() error
}

// EntryReadWriter combines EntryWriter and EntryReader behaviors
Expand All @@ -54,6 +56,8 @@ func NewEntryReader(st *dataset.Structure, r io.Reader) (EntryReader, error) {
return NewJSONReader(st, r)
case dataset.CSVDataFormat:
return NewCSVReader(st, r), nil
case dataset.XLSXDataFormat:
return NewXLSXReader(st, r)
case dataset.UnknownDataFormat:
err := fmt.Errorf("structure must have a data format")
log.Debug(err.Error())
Expand All @@ -74,6 +78,8 @@ func NewEntryWriter(st *dataset.Structure, w io.Writer) (EntryWriter, error) {
return NewJSONWriter(st, w)
case dataset.CSVDataFormat:
return NewCSVWriter(st, w), nil
case dataset.XLSXDataFormat:
return NewXLSXWriter(st, w)
case dataset.UnknownDataFormat:
err := fmt.Errorf("structure must have a data format")
log.Debug(err.Error())
Expand Down
2 changes: 1 addition & 1 deletion dsio/entry_buffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
)

// EntryBuffer mimics the behaviour of bytes.Buffer, but with structured Dataa
// Read and Write are replaced with ReadRow and WriteEntry. It's worth noting
// Read and Write are replaced with ReadEntry and WriteEntry. It's worth noting
// that different data formats have idisyncrcies that affect the behavior
// of buffers and their output. For example, EntryBuffer won't write things like
// CSV header rows or enclosing JSON arrays until after the writer's
Expand Down
Loading