qri-io · b5 · Feb 5, 2019 · Feb 4, 2019 · Feb 5, 2019 · dustmop
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -30,6 +30,7 @@ jobs:
             github.com/ghodss/yaml
             github.com/ipfs/go-log
             gopkg.in/yaml.v2
+            github.com/360EntSecGroup-Skylar/excelize
       - run: 
           name: Run Lint Tests
           command: golint ./...

diff --git a/data_format.go b/data_format.go
@@ -30,9 +30,8 @@ const (
 	// XMLDataFormat specifies eXtensible Markup Language-formatted data
 	// currently not supported.
 	XMLDataFormat
-	// XLSDataFormat specifies microsoft excel formatted data
-	// currently not supported.
-	XLSDataFormat
+	// XLSXDataFormat specifies microsoft excel formatted data
+	XLSXDataFormat
 )
 
 // SupportedDataFormats gives a slice of data formats that are
@@ -44,6 +43,7 @@ func SupportedDataFormats() []DataFormat {
 		CBORDataFormat,
 		JSONDataFormat,
 		CSVDataFormat,
+		XLSXDataFormat,
 	}
 }
 
@@ -54,7 +54,7 @@ func (f DataFormat) String() string {
 		CSVDataFormat:     "csv",
 		JSONDataFormat:    "json",
 		XMLDataFormat:     "xml",
-		XLSDataFormat:     "xls",
+		XLSXDataFormat:    "xlsx",
 		CBORDataFormat:    "cbor",
 	}[f]
 
@@ -76,8 +76,8 @@ func ParseDataFormatString(s string) (df DataFormat, err error) {
 		"json":  JSONDataFormat,
 		".xml":  XMLDataFormat,
 		"xml":   XMLDataFormat,
-		".xls":  XLSDataFormat,
-		"xls":   XLSDataFormat,
+		".xlsx": XLSXDataFormat,
+		"xlsx":  XLSXDataFormat,
 		"cbor":  CBORDataFormat,
 		".cbor": CBORDataFormat,
 	}[s]

diff --git a/data_format_config.go b/data_format_config.go
@@ -20,6 +20,8 @@ func ParseFormatConfigMap(f DataFormat, opts map[string]interface{}) (FormatConf
 		return NewCSVOptions(opts)
 	case JSONDataFormat:
 		return NewJSONOptions(opts)
+	case XLSXDataFormat:
+		return NewXLSXOptions(opts)
 	default:
 		return nil, fmt.Errorf("cannot parse configuration for format: %s", f.String())
 	}
@@ -141,3 +143,44 @@ func (o *JSONOptions) Map() map[string]interface{} {
 	}
 	return map[string]interface{}{}
 }
+
+// XLSXOptions specifies configuraiton details for the xlsx file format
+type XLSXOptions struct {
+	SheetName string `json:"sheetName,omitempty"`
+}
+
+// NewXLSXOptions creates a XLSXOptions pointer from a map
+func NewXLSXOptions(opts map[string]interface{}) (FormatConfig, error) {
+	o := &XLSXOptions{}
+	if opts == nil {
+		return o, nil
+	}
+
+	if opts["sheetName"] != nil {
+		if sheetName, ok := opts["sheetName"].(string); ok {
+			o.SheetName = sheetName
+		} else {
+			return nil, fmt.Errorf("invalid sheetName value: %v", opts["sheetName"])
+		}
+	}
+
+	return o, nil
+}
+
+// Format announces the XLSX data format for the FormatConfig interface
+func (*XLSXOptions) Format() DataFormat {
+	return XLSXDataFormat
+}
+
+// Map structures XLSXOptions as a map of string keys to values
+func (o *XLSXOptions) Map() map[string]interface{} {
+	if o == nil {
+		return nil
+	}
+	opt := map[string]interface{}{}
+	if o.SheetName != "" {
+		opt["sheetName"] = o.SheetName
+	}
+
+	return opt
+}
diff --git a/data_format_config_test.go b/data_format_config_test.go
@@ -30,7 +30,7 @@ func TestParseFormatConfigMap(t *testing.T) {
 	}{
 		{CSVDataFormat, map[string]interface{}{}, &CSVOptions{}, ""},
 		{JSONDataFormat, map[string]interface{}{}, &JSONOptions{}, ""},
-		{XLSDataFormat, map[string]interface{}{}, nil, "cannot parse configuration for format: xls"},
+		{XLSXDataFormat, map[string]interface{}{}, &XLSXOptions{}, ""},
 	}
 
 	for i, c := range cases {
@@ -136,3 +136,56 @@ func TestJSONOptionsMap(t *testing.T) {
 		}
 	}
 }
+
+func TestNewXLSXOptions(t *testing.T) {
+	cases := []struct {
+		opts map[string]interface{}
+		res  *XLSXOptions
+		err  string
+	}{
+		{nil, &XLSXOptions{}, ""},
+		{map[string]interface{}{}, &XLSXOptions{}, ""},
+		{map[string]interface{}{"sheetName": "foo"}, &XLSXOptions{SheetName: "foo"}, ""},
+		{map[string]interface{}{"sheetName": true}, nil, "invalid sheetName value: true"},
+	}
+
+	for i, c := range cases {
+		got, err := NewXLSXOptions(c.opts)
+		if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
+			t.Errorf("case %d error expected: '%s', got: '%s'", i, c.err, err)
+			continue
+		}
+		if c.err == "" {
+			xlsxo, ok := got.(*XLSXOptions)
+			if !ok {
+				t.Errorf("case %d didn't return a CSVOptions pointer", i)
+				continue
+			}
+
+			if xlsxo.SheetName != c.res.SheetName {
+				t.Errorf("case %d SheetName expected: %s, got: %s", i, xlsxo.SheetName, c.res.SheetName)
+				continue
+			}
+		}
+	}
+}
+
+func TestXLSXOptionsMap(t *testing.T) {
+	cases := []struct {
+		opt *XLSXOptions
+		res map[string]interface{}
+	}{
+		{nil, nil},
+		{&XLSXOptions{}, map[string]interface{}{}},
+		{&XLSXOptions{SheetName: "foo"}, map[string]interface{}{"sheetName": "foo"}},
+	}
+
+	for i, c := range cases {
+		got := c.opt.Map()
+		for key, val := range c.res {
+			if got[key] != val {
+				t.Errorf("case %d, key '%s' expected: '%s' got:'%s'", i, key, val, got[key])
+			}
+		}
+	}
+}
diff --git a/data_format_test.go b/data_format_test.go
@@ -10,6 +10,7 @@ func TestSupportedDataFormats(t *testing.T) {
 		CBORDataFormat,
 		JSONDataFormat,
 		CSVDataFormat,
+		XLSXDataFormat,
 	}
 
 	for i, f := range SupportedDataFormats() {
@@ -28,7 +29,7 @@ func TestDataFormatString(t *testing.T) {
 		{CSVDataFormat, "csv"},
 		{JSONDataFormat, "json"},
 		{XMLDataFormat, "xml"},
-		{XLSDataFormat, "xls"},
+		{XLSXDataFormat, "xlsx"},
 		{CBORDataFormat, "cbor"},
 	}
 
@@ -53,8 +54,8 @@ func TestParseDataFormatString(t *testing.T) {
 		{"json", JSONDataFormat, ""},
 		{".xml", XMLDataFormat, ""},
 		{"xml", XMLDataFormat, ""},
-		{".xls", XLSDataFormat, ""},
-		{"xls", XLSDataFormat, ""},
+		{".xlsx", XLSXDataFormat, ""},
+		{"xlsx", XLSXDataFormat, ""},
 		{"cbor", CBORDataFormat, ""},
 		{".cbor", CBORDataFormat, ""},
 	}
@@ -81,7 +82,7 @@ func TestDataFormatMarshalJSON(t *testing.T) {
 		{CSVDataFormat, []byte(`"csv"`), ""},
 		{JSONDataFormat, []byte(`"json"`), ""},
 		{XMLDataFormat, []byte(`"xml"`), ""},
-		{XLSDataFormat, []byte(`"xls"`), ""},
+		{XLSXDataFormat, []byte(`"xlsx"`), ""},
 		{CBORDataFormat, []byte(`"cbor"`), ""},
 	}
 	for i, c := range cases {
@@ -106,7 +107,7 @@ func TestDataFormatUnmarshalJSON(t *testing.T) {
 		{[]byte(`"csv"`), CSVDataFormat, ""},
 		{[]byte(`"json"`), JSONDataFormat, ""},
 		{[]byte(`"xml"`), XMLDataFormat, ""},
-		{[]byte(`"xls"`), XLSDataFormat, ""},
+		{[]byte(`"xlsx"`), XLSXDataFormat, ""},
 		{[]byte(`"cbor"`), CBORDataFormat, ""},
 	}
 

diff --git a/detect/detect.go b/detect/detect.go
@@ -59,8 +59,8 @@ func ExtensionDataFormat(path string) (format dataset.DataFormat, err error) {
 		return dataset.CSVDataFormat, nil
 	case ".xml":
 		return dataset.XMLDataFormat, nil
-	case ".xls":
-		return dataset.XLSDataFormat, nil
+	case ".xlsx":
+		return dataset.XLSXDataFormat, nil
 	case "":
 		return dataset.UnknownDataFormat, errors.New("no file extension provided")
 	default:

diff --git a/detect/detect_test.go b/detect/detect_test.go
@@ -85,7 +85,7 @@ func TestExtensionDataFormat(t *testing.T) {
 		{"foo/bar/baz.csv", dataset.CSVDataFormat, ""},
 		{"foo/bar/baz.json", dataset.JSONDataFormat, ""},
 		{"foo/bar/baz.xml", dataset.XMLDataFormat, ""},
-		{"foo/bar/baz.xls", dataset.XLSDataFormat, ""},
+		{"foo/bar/baz.xlsx", dataset.XLSXDataFormat, ""},
 		{"foo/bar/baz.cbor", dataset.CBORDataFormat, ""},
 		{"foo/bar/baz", dataset.UnknownDataFormat, "no file extension provided"},
 		{"foo/bar/baz.jpg", dataset.UnknownDataFormat, "unsupported file type: '.jpg'"},

diff --git a/detect/determineFields.go b/detect/determineFields.go
@@ -34,6 +34,8 @@ func Schema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}
 		return JSONSchema(r, data)
 	case dataset.CSVDataFormat:
 		return CSVSchema(r, data)
+	case dataset.XLSXDataFormat:
+		return XLSXSchema(r, data)
 	default:
 		err = fmt.Errorf("'%s' is not supported for field detection", r.Format)
 		return

diff --git a/detect/xlsx.go b/detect/xlsx.go
@@ -0,0 +1,13 @@
+package detect
+
+import (
+	"io"
+
+	"github.com/qri-io/dataset"
+)
+
+// XLSXSchema determines any schema information for an excel spreadsheet
+// TODO (b5): currently unimplemented
+func XLSXSchema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) {
+	return dataset.BaseSchemaArray, 0, nil
+}
diff --git a/dsfs/dataset.go b/dsfs/dataset.go
@@ -12,12 +12,12 @@ import (
 
 	"github.com/libp2p/go-libp2p-crypto"
 	"github.com/multiformats/go-multihash"
-	"github.com/qri-io/qfs/cafs"
 	"github.com/qri-io/dataset"
 	"github.com/qri-io/dataset/dsio"
 	"github.com/qri-io/dataset/validate"
 	"github.com/qri-io/dsdiff"
 	"github.com/qri-io/qfs"
+	"github.com/qri-io/qfs/cafs"
 )
 
 // LoadDataset reads a dataset from a cafs and dereferences structure, transform, and commitMsg if they exist,
@@ -284,6 +284,15 @@ func prepareDataset(store cafs.Filestore, ds, dsPrev *dataset.Dataset, privKey c
 
 	cleanTitleAndMessage(&ds.Commit.Title, &ds.Commit.Message, diffDescription)
 
+	// TODO (b5): this is a hack until we have a better dataset differ
+	// "Structure: 2 changes" implies that the underlying bytes that represent the
+	// data has changed, but the acutal data itself hasn't.
+	// two elements in structure are byte-sensitive: checksum and length
+	// we. need. better. diffing. tools.
+	if ds.Commit.Title == "Structure: 2 changes" {
+		return "", fmt.Errorf("no meaningful changes detected")
+	}
+
 	ds.Commit.Timestamp = Timestamp()
 	sb, _ := ds.SignableBytes()
 	signedBytes, err := privKey.Sign(sb)

diff --git a/dsio/cbor.go b/dsio/cbor.go
@@ -21,6 +21,8 @@ type CBORReader struct {
 	length   int
 }
 
+var _ EntryReader = (*CBORReader)(nil)
+
 var (
 	bigen = binary.BigEndian
 )
@@ -94,6 +96,12 @@ func (r *CBORReader) ReadEntry() (ent Entry, err error) {
 	return
 }
 
+// Close finalizes the reader
+func (r *CBORReader) Close() error {
+	// TODO (b5): check if underlying reader is an io.ReadCloser, call close here if so
+	return nil
+}
+
 const (
 	cborBdFalse byte = 0xf4 + iota
 	cborBdTrue

diff --git a/dsio/csv.go b/dsio/csv.go
@@ -20,6 +20,8 @@ type CSVReader struct {
 	types      []string
 }
 
+var _ EntryReader = (*CSVReader)(nil)
+
 // NewCSVReader creates a reader from a structure and read source
 func NewCSVReader(st *dataset.Structure, r io.Reader) *CSVReader {
 	// TODO - handle error
@@ -80,6 +82,13 @@ func (r *CSVReader) ReadEntry() (Entry, error) {
 	return Entry{Value: value}, nil
 }
 
+// Close finalizes the reader
+func (r *CSVReader) Close() error {
+	// TODO (b5): we should retain a reference to the underlying reader &
+	// check if it's an io.ReadCloser, calling close here if so
+	return nil
+}
+
 // decode uses specified types from structure's schema to cast csv string values to their
 // intended types. If casting fails because the data is invalid, it's left as a string instead
 // of causing an error.

diff --git a/dsio/dsio.go b/dsio/dsio.go
@@ -28,6 +28,8 @@ type EntryReader interface {
 	Structure() *dataset.Structure
 	// ReadVal reads one row of structured data from the reader
 	ReadEntry() (Entry, error)
+	// Close finalizes the Reader
+	Close() error
 }
 
 // EntryReadWriter combines EntryWriter and EntryReader behaviors
@@ -54,6 +56,8 @@ func NewEntryReader(st *dataset.Structure, r io.Reader) (EntryReader, error) {
 		return NewJSONReader(st, r)
 	case dataset.CSVDataFormat:
 		return NewCSVReader(st, r), nil
+	case dataset.XLSXDataFormat:
+		return NewXLSXReader(st, r)
 	case dataset.UnknownDataFormat:
 		err := fmt.Errorf("structure must have a data format")
 		log.Debug(err.Error())
@@ -74,6 +78,8 @@ func NewEntryWriter(st *dataset.Structure, w io.Writer) (EntryWriter, error) {
 		return NewJSONWriter(st, w)
 	case dataset.CSVDataFormat:
 		return NewCSVWriter(st, w), nil
+	case dataset.XLSXDataFormat:
+		return NewXLSXWriter(st, w)
 	case dataset.UnknownDataFormat:
 		err := fmt.Errorf("structure must have a data format")
 		log.Debug(err.Error())

diff --git a/dsio/entry_buffer.go b/dsio/entry_buffer.go
@@ -7,7 +7,7 @@ import (
 )
 
 // EntryBuffer mimics the behaviour of bytes.Buffer, but with structured Dataa
-// Read and Write are replaced with ReadRow and WriteEntry. It's worth noting
+// Read and Write are replaced with ReadEntry and WriteEntry. It's worth noting
 // that different data formats have idisyncrcies that affect the behavior
 // of buffers and their output. For example, EntryBuffer won't write things like
 // CSV header rows or enclosing JSON arrays until after the writer's