Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BulkLoad Job and Co-Run BulkLoad and BulkDump #11828

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion documentation/sphinx/source/bulkdump.rst
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ ManagementAPI provides following interfaces to do the operations:
1. Submit a job: submitBulkDumpJob(BulkDumpState job); // For generating the input job metadata, see the point 4.
2. Clear a job: clearBulkDumpJob();
3. Enable the feature: setBulkDumpMode(int mode); // Set mode = 1 to enable; Set mode = 0 to disable.
4. BulkDump job metadata is generated by newBulkDumpTaskLocalSST(KeyRange range, std::string remoteRoot); // Will include more APIs to generate the metadata as the funcationality expands (sp of functionality).
4. BulkDump job metadata is generated by newBulkDumpJobLocalSST(KeyRange range, std::string remoteRoot); // Will include more APIs to generate the metadata as the functionality expands.

Mechanisms
==========
Expand Down
4 changes: 2 additions & 2 deletions fdbcli/BulkDumpCommand.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <cstddef>
#include <fmt/core.h>
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/BulkDumping.h"
#include "fdbclient/BulkLoadAndDump.h"
#include "fdbclient/IClientApi.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "flow/Arena.h"
Expand Down Expand Up @@ -108,7 +108,7 @@ ACTOR Future<UID> bulkDumpCommandActor(Reference<IClusterConnectionRecord> clust
}
std::string remoteRoot = tokens[4].toString();
KeyRange range = Standalone(KeyRangeRef(rangeBegin, rangeEnd));
state BulkDumpState bulkDumpJob = newBulkDumpTaskLocalSST(range, remoteRoot);
state BulkDumpState bulkDumpJob = newBulkDumpJobLocalSST(range, remoteRoot);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good change to make.

wait(submitBulkDumpJob(cx, bulkDumpJob));
return bulkDumpJob.getJobId();

Expand Down
53 changes: 27 additions & 26 deletions fdbcli/BulkLoadCommand.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,38 +25,39 @@
#include "fdbclient/IClientApi.h"

#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/BulkLoading.h"
#include "fdbclient/BulkLoadAndDump.h"

#include "flow/Arena.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.

namespace fdb_cli {

ACTOR Future<Void> getBulkLoadStateByRange(Database cx,
KeyRange rangeToRead,
size_t countLimit,
Optional<BulkLoadPhase> phase) {
ACTOR Future<Void> getBulkLoadTaskStateByRange(Database cx,
KeyRange rangeToRead,
size_t countLimit,
Optional<BulkLoadTaskPhase> phase) {
try {
std::vector<BulkLoadState> res = wait(getValidBulkLoadTasksWithinRange(cx, rangeToRead, countLimit, phase));
std::vector<BulkLoadTaskState> res = wait(getValidBulkLoadTasksWithinRange(cx, rangeToRead, countLimit, phase));
int64_t finishCount = 0;
int64_t unfinishedCount = 0;
for (const auto& bulkLoadState : res) {
if (bulkLoadState.phase == BulkLoadPhase::Complete) {
fmt::println("[Complete]: {}", bulkLoadState.toString());
for (const auto& bulkLoadTaskState : res) {
if (bulkLoadTaskState.phase == BulkLoadTaskPhase::Complete) {
fmt::println("[Complete]: {}", bulkLoadTaskState.toString());
++finishCount;
} else if (bulkLoadState.phase == BulkLoadPhase::Running) {
fmt::println("[Running]: {}", bulkLoadState.toString());
} else if (bulkLoadTaskState.phase == BulkLoadTaskPhase::Running) {
fmt::println("[Running]: {}", bulkLoadTaskState.toString());
++unfinishedCount;
} else if (bulkLoadState.phase == BulkLoadPhase::Triggered) {
fmt::println("[Triggered]: {}", bulkLoadState.toString());
} else if (bulkLoadTaskState.phase == BulkLoadTaskPhase::Triggered) {
fmt::println("[Triggered]: {}", bulkLoadTaskState.toString());
++unfinishedCount;
} else if (bulkLoadState.phase == BulkLoadPhase::Submitted) {
fmt::println("[Submitted] {}", bulkLoadState.toString());
} else if (bulkLoadTaskState.phase == BulkLoadTaskPhase::Submitted) {
fmt::println("[Submitted] {}", bulkLoadTaskState.toString());
++unfinishedCount;
} else if (bulkLoadState.phase == BulkLoadPhase::Acknowledged) {
fmt::println("[Acknowledge] {}", bulkLoadState.toString());
} else if (bulkLoadTaskState.phase == BulkLoadTaskPhase::Acknowledged) {
fmt::println("[Acknowledge] {}", bulkLoadTaskState.toString());
++finishCount;
} else {
UNREACHABLE();
Expand Down Expand Up @@ -128,7 +129,7 @@ ACTOR Future<UID> bulkLoadCommandActor(Reference<IClusterConnectionRecord> clust
std::string byteSampleFile = tokens[6].toString(); // TODO(BulkLoad): reject if the input bytes sampling file is
// not same as the configuration as FDB cluster
KeyRange range = Standalone(KeyRangeRef(rangeBegin, rangeEnd));
state BulkLoadState bulkLoadTask = newBulkLoadTaskLocalSST(range, folder, dataFile, byteSampleFile);
state BulkLoadTaskState bulkLoadTask = newBulkLoadTaskLocalSST(UID(), range, folder, dataFile, byteSampleFile);
wait(submitBulkLoadTask(cx, bulkLoadTask));
return bulkLoadTask.getTaskId();

Expand All @@ -148,25 +149,25 @@ ACTOR Future<UID> bulkLoadCommandActor(Reference<IClusterConnectionRecord> clust
}
KeyRange range = Standalone(KeyRangeRef(rangeBegin, rangeEnd));
std::string inputPhase = tokens[4].toString();
Optional<BulkLoadPhase> phase;
Optional<BulkLoadTaskPhase> phase;
if (inputPhase == "all") {
phase = Optional<BulkLoadPhase>();
phase = Optional<BulkLoadTaskPhase>();
} else if (inputPhase == "submitted") {
phase = BulkLoadPhase::Submitted;
phase = BulkLoadTaskPhase::Submitted;
} else if (inputPhase == "triggered") {
phase = BulkLoadPhase::Triggered;
phase = BulkLoadTaskPhase::Triggered;
} else if (inputPhase == "running") {
phase = BulkLoadPhase::Running;
phase = BulkLoadTaskPhase::Running;
} else if (inputPhase == "complete") {
phase = BulkLoadPhase::Complete;
phase = BulkLoadTaskPhase::Complete;
} else if (inputPhase == "acknowledged") {
phase = BulkLoadPhase::Acknowledged;
phase = BulkLoadTaskPhase::Acknowledged;
} else {
printUsage(tokens[0]);
return UID();
}
int countLimit = std::stoi(tokens[5].toString());
wait(getBulkLoadStateByRange(cx, range, countLimit, phase));
wait(getBulkLoadTaskStateByRange(cx, range, countLimit, phase));
return UID();

} else {
Expand Down
26 changes: 0 additions & 26 deletions fdbclient/BulkDumping.cpp

This file was deleted.

110 changes: 110 additions & 0 deletions fdbclient/BulkLoadAndDump.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
* BulkLoadAndDump.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "fdbclient/BulkLoadAndDump.h"
#include "flow/Platform.h"

// Strips `prefix` from the front of `str` and returns the remainder.
// Returns an empty string when `str` does not begin with `prefix`; note that
// this is indistinguishable from the case where `str` equals `prefix`.
std::string stringRemovePrefix(std::string str, const std::string& prefix) {
	const size_t prefixLen = prefix.length();
	const bool startsWithPrefix = (str.compare(0, prefixLen, prefix) == 0);
	if (!startsWithPrefix) {
		return "";
	}
	str.erase(0, prefixLen);
	return str;
}

// Inverse of StringRef.toFullHexStringPlain().
// The input is expected to be a sequence of two-character hex bytes separated
// by single spaces (e.g. "0a ff 10"), so its length plus one must be a
// multiple of three. An empty input yields an empty Key.
Key getKeyFromHexString(const std::string& rawString) {
	if (rawString.empty()) {
		return Key();
	}
	ASSERT((rawString.size() + 1) % 3 == 0);
	std::vector<uint8_t> decodedBytes;
	size_t pos = 0;
	while (pos < rawString.size()) {
		// Each group is two hex digits...
		const std::string hexPair = rawString.substr(pos, 2);
		decodedBytes.push_back(static_cast<uint8_t>(std::stoul(hexPair, nullptr, 16)));
		// ...followed by a space, except after the final byte.
		const size_t separatorPos = pos + 2;
		ASSERT(separatorPos >= rawString.size() || rawString[separatorPos] == ' ');
		pos += 3;
	}
	return Standalone(StringRef(decodedBytes.data(), decodedBytes.size()));
}

// Builds the job-manifest file name for a job: "<jobId>-job-manifest.txt".
std::string generateBulkLoadJobManifestFileName(const UID& jobId) {
	std::string fileName = jobId.toString();
	fileName += "-job-manifest.txt";
	return fileName;
}

// Builds a (local, remote) pair of file sets for the given data version.
// Both sets share the same relative folder and the same version-derived file
// names; they differ only in their root (rootLocal vs. rootRemote).
std::pair<BulkLoadFileSet, BulkLoadFileSet> generateBulkLoadFileSetting(Version version,
                                                                        const std::string& relativeFolder,
                                                                        const std::string& rootLocal,
                                                                        const std::string& rootRemote) {
	// Every file name is prefixed with the data version.
	const std::string versionTag = std::to_string(version);
	const std::string manifestName = versionTag + "-manifest.txt";
	const std::string dataName = versionTag + "-data.sst";
	const std::string sampleName = versionTag + "-sample.sst";
	return std::make_pair(BulkLoadFileSet(rootLocal, relativeFolder, manifestName, dataName, sampleName),
	                      BulkLoadFileSet(rootRemote, relativeFolder, manifestName, dataName, sampleName));
}

// Returns the job's root folder: `root` joined with the job id's string form.
std::string generateBulkLoadJobRoot(const std::string& root, const UID& jobId) {
	const std::string jobFolder = jobId.toString();
	return joinPath(root, jobFolder);
}

// Produces the text of a job manifest file: one header line (manifest count and
// the shared root path) followed by one line per manifest, in the iteration
// order of `manifests` (i.e. ordered by the map key).
//
// All manifests are expected to share the same fileSet.rootPath; a mismatch
// trips an ASSERT. The first non-empty rootPath seen is taken as the root.
std::string generateBulkLoadJobManifestFileContent(const std::map<Key, BulkLoadManifest>& manifests) {
	std::string root = "";
	std::string manifestList;
	for (const auto& [beginKey, manifest] : manifests) {
		if (root.empty()) {
			root = manifest.fileSet.rootPath;
		} else {
			ASSERT(manifest.fileSet.rootPath == root);
		}
		// Append in place: `manifestList = manifestList + ...` would re-copy the
		// whole accumulated list on every iteration (quadratic in total size).
		manifestList += BulkDumpJobManifestEntry(manifest.getBeginKey(),
		                                         manifest.getEndKey(),
		                                         joinPath(manifest.fileSet.relativePath, manifest.fileSet.manifestFileName),
		                                         manifest.version,
		                                         manifest.bytes)
		                    .toString();
		manifestList += "\n";
	}
	// The header is built last because it needs the root discovered above.
	std::string content = BulkDumpJobManifestHeader(manifests.size(), root).toString();
	content += "\n";
	content += manifestList;
	return content;
}

// Creates a bulk load task state for a single SST data file under `folder`,
// using the CP transport method and associated with job `jobID`.
BulkLoadTaskState newBulkLoadTaskLocalSST(UID jobID,
                                          KeyRange range,
                                          std::string folder,
                                          std::string dataFile,
                                          std::string bytesSampleFile) {
	// The task state takes a set of data files; this helper supplies exactly one.
	const std::unordered_set<std::string> singleDataFile{ dataFile };
	return BulkLoadTaskState(range,
	                         BulkLoadFileType::SST,
	                         BulkLoadTransportMethod::CP,
	                         folder,
	                         singleDataFile,
	                         bytesSampleFile,
	                         jobID);
}

// Creates a bulk load job state for `range` rooted at `remoteRoot`, using the
// CP transport method.
BulkLoadJobState newBulkLoadJobLocalSST(const UID& jobId, const KeyRange& range, const std::string& remoteRoot) {
	const auto transportMethod = BulkLoadTransportMethod::CP;
	return BulkLoadJobState(jobId, remoteRoot, range, transportMethod);
}

// Creates a bulk dump job state for `range` that writes SST files to
// `remoteRoot`, using the CP transport method.
BulkDumpState newBulkDumpJobLocalSST(const KeyRange& range, const std::string& remoteRoot) {
	const auto fileType = BulkLoadFileType::SST;
	const auto transportMethod = BulkLoadTransportMethod::CP;
	return BulkDumpState(range, fileType, transportMethod, remoteRoot);
}
36 changes: 0 additions & 36 deletions fdbclient/BulkLoading.cpp

This file was deleted.

Loading