diff --git a/velox/connectors/hive/FileHandle.cpp b/velox/connectors/hive/FileHandle.cpp index b092946f1e10..8deabd1f2d72 100644 --- a/velox/connectors/hive/FileHandle.cpp +++ b/velox/connectors/hive/FileHandle.cpp @@ -34,8 +34,7 @@ std::string groupName(const std::string& filename) { } // namespace std::shared_ptr FileHandleGenerator::operator()( - std::tuple - params) { + const std::string& filename) { // We have seen cases where drivers are stuck when creating file handles. // Adding a trace here to spot this more easily in future. process::TraceContext trace("FileHandleGenerator::operator()"); @@ -44,16 +43,11 @@ std::shared_ptr FileHandleGenerator::operator()( { MicrosecondTimer timer(&elapsedTimeUs); fileHandle = std::make_shared(); - filesystems::FileOptions options; - options.values["fileSize"] = std::get<1>(params); - // Add length and modification time here to create a unique handle? - fileHandle->file = - filesystems::getFileSystem(std::get<0>(params), properties_) - ->openFileForRead(std::get<0>(params), options); - fileHandle->uuid = StringIdLease(fileIds(), std::get<0>(params)); - fileHandle->groupId = - StringIdLease(fileIds(), groupName(std::get<0>(params))); - VLOG(1) << "Generating file handle for: " << std::get<0>(params) + fileHandle->file = filesystems::getFileSystem(filename, properties_) + ->openFileForRead(filename); + fileHandle->uuid = StringIdLease(fileIds(), filename); + fileHandle->groupId = StringIdLease(fileIds(), groupName(filename)); + VLOG(1) << "Generating file handle for: " << filename << " uuid: " << fileHandle->uuid.id(); } RECORD_HISTOGRAM_METRIC_VALUE( diff --git a/velox/connectors/hive/FileHandle.h b/velox/connectors/hive/FileHandle.h index 839899daf941..6fb6853d7544 100644 --- a/velox/connectors/hive/FileHandle.h +++ b/velox/connectors/hive/FileHandle.h @@ -62,16 +62,14 @@ class FileHandleGenerator { FileHandleGenerator() {} FileHandleGenerator(std::shared_ptr properties) : properties_(std::move(properties)) {} - std::shared_ptr operator()( - std::tuple - params); + std::shared_ptr operator()(const std::string& filename); private: const std::shared_ptr properties_; }; using FileHandleFactory = CachedFactory< - std::tuple, + std::string, std::shared_ptr, FileHandleGenerator>; diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp index 94593ca50af8..c693e8ee0922 100644 --- a/velox/connectors/hive/HiveConnector.cpp +++ b/velox/connectors/hive/HiveConnector.cpp @@ -59,12 +59,8 @@ HiveConnector::HiveConnector( hiveConfig_(std::make_shared(config)), fileHandleFactory_( hiveConfig_->isFileHandleCacheEnabled() - ? std::make_unique, - std::shared_ptr>>( + ? std::make_unique< + SimpleLRUCache>>( hiveConfig_->numCacheFileHandles()) : nullptr, std::make_unique(config)), diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 3d47d8ba31dd..072a5f68d54a 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -146,22 +146,7 @@ void SplitReader::prepareSplit( std::shared_ptr fileHandle; try { - auto fileSizeIter = hiveSplit_->infoColumns.find(kFileSize); - auto fileModificationTimeIter = - hiveSplit_->infoColumns.find(kModificationTime); - if (fileSizeIter != hiveSplit_->infoColumns.end() && - fileModificationTimeIter != hiveSplit_->infoColumns.end()) { - fileHandle = fileHandleFactory_ - ->generate(std::make_tuple( - hiveSplit_->filePath, - fileSizeIter->second, - fileModificationTimeIter->second)) - .second; - } else { - fileHandle = fileHandleFactory_ - ->generate(std::make_tuple(hiveSplit_->filePath, "", "")) - .second; - } + fileHandle = fileHandleFactory_->generate(hiveSplit_->filePath).second; } catch (const VeloxRuntimeError& e) { if (e.errorCode() == error_code::kFileNotFound && hiveConfig_->ignoreMissingFiles( diff --git a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp index ae68c6595b5c..b87007fb9804 100644 --- a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp +++ b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp @@ -94,8 +94,7 @@ PositionalDeleteFileReader::PositionalDeleteFileReader( deleteSplit_); auto deleteFileHandle = - fileHandleFactory_->generate(make_tuple(deleteFile_.filePath, "", "")) - .second; + fileHandleFactory_->generate(deleteFile_.filePath).second; auto deleteFileInput = createBufferedInput( *deleteFileHandle, deleteReaderOpts, diff --git a/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.cpp b/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.cpp index df99d5ec417a..ebf0853eab9e 100644 --- a/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.cpp +++ b/velox/connectors/hive/storage_adapters/gcs/GCSFileSystem.cpp @@ -72,7 +72,12 @@ class GCSReadFile final : public ReadFile { // Gets the length of the file. // Checks if there are any issues reading the file. - void initialize() { + void initialize(const FileOptions& options) { + if (options.values.count("fileSize") > 0) { + length_ = !options.values.at("fileSize").empty() + ? std::stoull(options.values.at("fileSize")) + : -1; + } // Make it a no-op if invoked twice. if (length_ != -1) { return; @@ -308,7 +313,7 @@ std::unique_ptr GCSFileSystem::openFileForRead( const FileOptions& options) { const auto gcspath = gcsPath(path); auto gcsfile = std::make_unique(gcspath, impl_->getClient()); - gcsfile->initialize(); + gcsfile->initialize(options); return gcsfile; } diff --git a/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.cpp b/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.cpp index a88b71bfa002..ce0bc6b6de63 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.cpp +++ b/velox/connectors/hive/storage_adapters/s3fs/S3FileSystem.cpp @@ -79,7 +79,12 @@ class S3ReadFile final : public ReadFile { // Gets the length of the file. // Checks if there are any issues reading the file. - void initialize() { + void initialize(const FileOptions& options) { + if (options.values.count("fileSize") > 0) { + length_ = !options.values.at("fileSize").empty() + ? std::stoull(options.values.at("fileSize")) + : -1; + } // Make it a no-op if invoked twice. if (length_ != -1) { return; @@ -643,7 +648,7 @@ std::unique_ptr S3FileSystem::openFileForRead( const FileOptions& options) { const auto file = s3Path(path); auto s3file = std::make_unique(file, impl_->s3Client()); - s3file->initialize(); + s3file->initialize(options); return s3file; } diff --git a/velox/connectors/hive/tests/FileHandleTest.cpp b/velox/connectors/hive/tests/FileHandleTest.cpp index 3962b8285bc5..659f0299f9ee 100644 --- a/velox/connectors/hive/tests/FileHandleTest.cpp +++ b/velox/connectors/hive/tests/FileHandleTest.cpp @@ -37,11 +37,10 @@ TEST(FileHandleTest, localFile) { } FileHandleFactory factory( - std::make_unique, - std::shared_ptr>>(1000), + std::make_unique< + SimpleLRUCache>>(1000), std::make_unique()); - auto fileHandle = factory.generate(std::make_tuple(filename, "", "")).second; + auto fileHandle = factory.generate(filename).second; ASSERT_EQ(fileHandle->file->size(), 3); char buffer[3]; ASSERT_EQ(fileHandle->file->pread(0, 3, &buffer), "foo");