TerraSync: switch to per-directory hash caching

Avoid the central hash cache becoming enormous, now that we use the
selective download scheme for the tile dirs.

The hash-cache file name is changed so it can co-exist with older versions.
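Each tile directory now keeps its own cache in a .dirhash file stored inside that directory, replacing the single .hashes file at the repository root. Going by writeHashCache() and parseHashCache() in the diff below, a record is one '*'-separated line per file: path, modification time (seconds since the epoch), size in bytes, and the hex-encoded SHA-1. A hypothetical example line (the path and values are illustrative only):

    /home/user/TerraSync/Terrain/e000n40/.dirindex*1604011200*1532*2fd4e1c67a2d28fced849ee1bb76e7391b93eb12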
James Turner 2020-10-30 22:49:38 +00:00
parent 990c53fc9c
commit 8d6d671cc1
2 changed files with 205 additions and 188 deletions

simgear/io/HTTPRepository.cxx

@@ -81,6 +81,45 @@ std::string innerResultCodeAsString(HTTPRepository::ResultCode code) {
     return "Unknown response code";
 }
 
+struct HashCacheEntry {
+    std::string filePath;
+    time_t modTime;
+    size_t lengthBytes;
+    std::string hashHex;
+};
+
+using HashCache = std::unordered_map<std::string, HashCacheEntry>;
+
+std::string computeHashForPath(const SGPath& p)
+{
+    if (!p.exists())
+        return {};
+    sha1nfo info;
+    sha1_init(&info);
+    const int bufSize = 1024 * 1024;
+    char* buf = static_cast<char*>(malloc(bufSize));
+    if (!buf) {
+        throw sg_io_exception("Couldn't allocate SHA1 computation buffer");
+    }
+    size_t readLen;
+    SGBinaryFile f(p);
+    if (!f.open(SG_IO_IN)) {
+        free(buf);
+        throw sg_io_exception("Couldn't open file for compute hash", p);
+    }
+    while ((readLen = f.read(buf, bufSize)) > 0) {
+        sha1_write(&info, buf, readLen);
+    }
+    f.close();
+    free(buf);
+    std::string hashBytes((char*)sha1_result(&info), HASH_LENGTH);
+    return strutils::encodeHex(hashBytes);
+}
+
 } // namespace
 
 class HTTPDirectory
@@ -112,6 +151,9 @@ class HTTPDirectory
     typedef std::vector<ChildInfo> ChildInfoList;
     ChildInfoList children;
 
+    mutable HashCache hashes;
+    mutable bool hashCacheDirty = false;
+
 public:
     HTTPDirectory(HTTPRepoPrivate* repo, const std::string& path) :
         _repository(repo),
@@ -125,6 +167,8 @@ public:
             // already exists on disk
             parseDirIndex(children);
             std::sort(children.begin(), children.end());
+
+            parseHashCache();
         } catch (sg_exception&) {
             // parsing cache failed
             children.clear();
@@ -150,7 +194,7 @@ public:
     {
         SGPath fpath(absolutePath());
         fpath.append(".dirindex");
-        _repository->updatedFileContents(fpath, hash);
+        updatedFileContents(fpath, hash);
 
         children.clear();
         parseDirIndex(children);
@@ -179,20 +223,20 @@ public:
         size_t bufSize = 0;
         for (const auto& child : children) {
             if (child.type != HTTPRepository::FileType)
                 continue;
 
             if (child.path.exists())
                 continue;
 
             SGPath cp = _repository->installedCopyPath;
             cp.append(relativePath());
             cp.append(child.name);
             if (!cp.exists()) {
                 continue;
             }
 
             SGBinaryFile src(cp);
             SGBinaryFile dst(child.path);
             src.open(SG_IO_IN);
             dst.open(SG_IO_OUT);
@@ -388,7 +432,7 @@ public:
                     _relativePath + "/" + file,
                     HTTPRepository::REPO_ERROR_CHECKSUM);
             } else {
-                _repository->updatedFileContents(it->path, hash);
+                updatedFileContents(it->path, hash);
                 _repository->updatedChildSuccessfully(_relativePath + "/" +
                                                       file);
@@ -453,6 +497,50 @@ public:
         fpath.append(file);
         _repository->failedToUpdateChild(fpath, status);
     }
+
+    std::string hashForPath(const SGPath& p) const
+    {
+        const auto ps = p.utf8Str();
+        auto it = hashes.find(ps);
+        if (it != hashes.end()) {
+            const auto& entry = it->second;
+            // ensure data on disk hasn't changed.
+            // we could also use the file type here if we were paranoid
+            if ((p.sizeInBytes() == entry.lengthBytes) && (p.modTime() == entry.modTime)) {
+                return entry.hashHex;
+            }
+            // entry in the cache, but it's stale so remove and fall through
+            hashes.erase(it);
+        }
+
+        std::string hash = computeHashForPath(p);
+        updatedFileContents(p, hash);
+        return hash;
+    }
+
+    bool isHashCacheDirty() const
+    {
+        return hashCacheDirty;
+    }
+
+    void writeHashCache() const
+    {
+        if (!hashCacheDirty)
+            return;
+
+        hashCacheDirty = false;
+        SGPath cachePath = absolutePath() / ".dirhash";
+        sg_ofstream stream(cachePath, std::ios::out | std::ios::trunc | std::ios::binary);
+        for (const auto& e : hashes) {
+            const auto& entry = e.second;
+            stream << entry.filePath << "*" << entry.modTime << "*"
+                   << entry.lengthBytes << "*" << entry.hashHex << "\n";
+        }
+        stream.close();
+    }
 
 private:
     struct ChildWithName
@@ -576,7 +664,7 @@ private:
             ok = _repository->deleteDirectory(fpath, path);
         } else {
             // remove the hash cache entry
-            _repository->updatedFileContents(path, std::string());
+            updatedFileContents(path, std::string());
             ok = path.remove();
         }
@@ -594,9 +682,80 @@
         if (child.type == HTTPRepository::TarballType)
             p.concat(
                 ".tgz"); // For tarballs the hash is against the tarball file itself
-        return _repository->hashForPath(p);
+        return hashForPath(p);
     }
+
+    void parseHashCache()
+    {
+        hashes.clear();
+        SGPath cachePath = absolutePath() / ".dirhash";
+        if (!cachePath.exists()) {
+            return;
+        }
+
+        sg_ifstream stream(cachePath, std::ios::in);
+
+        while (!stream.eof()) {
+            std::string line;
+            std::getline(stream, line);
+            line = simgear::strutils::strip(line);
+            if (line.empty() || line[0] == '#')
+                continue;
+
+            string_list tokens = simgear::strutils::split(line, "*");
+            if (tokens.size() < 4) {
+                SG_LOG(SG_TERRASYNC, SG_WARN, "invalid entry in '" << cachePath << "': '" << line << "' (ignoring line)");
+                continue;
+            }
+
+            const std::string nameData = simgear::strutils::strip(tokens[0]);
+            const std::string timeData = simgear::strutils::strip(tokens[1]);
+            const std::string sizeData = simgear::strutils::strip(tokens[2]);
+            const std::string hashData = simgear::strutils::strip(tokens[3]);
+            if (nameData.empty() || timeData.empty() || sizeData.empty() || hashData.empty()) {
+                SG_LOG(SG_TERRASYNC, SG_WARN, "invalid entry in '" << cachePath << "': '" << line << "' (ignoring line)");
+                continue;
+            }
+
+            HashCacheEntry entry;
+            entry.filePath = nameData;
+            entry.hashHex = hashData;
+            entry.modTime = strtol(timeData.c_str(), NULL, 10);
+            entry.lengthBytes = strtol(sizeData.c_str(), NULL, 10);
+            hashes.insert(std::make_pair(entry.filePath, entry));
+        }
+    }
+
+    void updatedFileContents(const SGPath& p, const std::string& newHash) const
+    {
+        // remove the existing entry
+        const auto ps = p.utf8Str();
+        auto it = hashes.find(ps);
+        if (it != hashes.end()) {
+            hashes.erase(it);
+            hashCacheDirty = true;
+        }
+
+        if (newHash.empty()) {
+            return; // we're done
+        }
+
+        // use a cloned SGPath and reset its caching to force one stat() call
+        SGPath p2(p);
+        p2.set_cached(false);
+        p2.set_cached(true);
+
+        HashCacheEntry entry;
+        entry.filePath = ps;
+        entry.hashHex = newHash;
+        entry.modTime = p2.modTime();
+        entry.lengthBytes = p2.sizeInBytes();
+        hashes.insert(std::make_pair(ps, entry));
+        hashCacheDirty = true;
+    }
 
     HTTPRepoPrivate* _repository;
     std::string _relativePath; // in URL and file-system space
 };
@@ -607,7 +766,6 @@ HTTPRepository::HTTPRepository(const SGPath& base, HTTP::Client *cl) :
     _d->http = cl;
     _d->basePath = base;
     _d->rootDir.reset(new HTTPDirectory(_d.get(), ""));
-    _d->parseHashCache();
 }
 
 HTTPRepository::~HTTPRepository()
@@ -678,7 +836,6 @@ void HTTPRepoPrivate::checkForComplete()
 {
     if (pendingUpdateOfChildren.empty() && activeRequests.empty() && queuedRequests.empty()) {
         isUpdating = false;
-        writeHashCache();
     }
 }
@@ -896,7 +1053,7 @@ HTTPRepository::failure() const
             return;
         }
 
-        std::string curHash = _directory->repository()->hashForPath(path());
+        std::string curHash = _directory->hashForPath(path());
         if (hash != curHash) {
             simgear::Dir d(_directory->absolutePath());
             if (!d.exists()) {
@@ -995,6 +1152,7 @@ HTTPRepository::failure() const
         http->cancelRequest(*rq, "Repository object deleted");
     }
 
+    flushHashCaches();
     directories.clear(); // will delete them all
 }
@@ -1014,147 +1172,6 @@ HTTPRepository::failure() const
     return r;
 }
 
-std::string HTTPRepoPrivate::hashForPath(const SGPath& p)
-{
-    const auto ps = p.utf8Str();
-    auto it = hashes.find(ps);
-    if (it != hashes.end()) {
-        const auto& entry = it->second;
-        // ensure data on disk hasn't changed.
-        // we could also use the file type here if we were paranoid
-        if ((p.sizeInBytes() == entry.lengthBytes) && (p.modTime() == entry.modTime)) {
-            return entry.hashHex;
-        }
-        // entry in the cache, but it's stale so remove and fall through
-        hashes.erase(it);
-    }
-
-    std::string hash = computeHashForPath(p);
-    updatedFileContents(p, hash);
-    return hash;
-}
-
-std::string HTTPRepoPrivate::computeHashForPath(const SGPath& p)
-{
-    if (!p.exists())
-        return {};
-    sha1nfo info;
-    sha1_init(&info);
-    const int bufSize = 1024 * 1024;
-    char* buf = static_cast<char*>(malloc(bufSize));
-    if (!buf) {
-        sg_io_exception("Couldn't allocate SHA1 computation buffer");
-    }
-    size_t readLen;
-    SGBinaryFile f(p);
-    if (!f.open(SG_IO_IN)) {
-        free(buf);
-        throw sg_io_exception("Couldn't open file for compute hash", p);
-    }
-    while ((readLen = f.read(buf, bufSize)) > 0) {
-        sha1_write(&info, buf, readLen);
-    }
-    f.close();
-    free(buf);
-    std::string hashBytes((char*)sha1_result(&info), HASH_LENGTH);
-    return strutils::encodeHex(hashBytes);
-}
-
-void HTTPRepoPrivate::updatedFileContents(const SGPath& p, const std::string& newHash)
-{
-    // remove the existing entry
-    const auto ps = p.utf8Str();
-    auto it = hashes.find(ps);
-    if (it != hashes.end()) {
-        hashes.erase(it);
-        ++hashCacheDirty;
-    }
-
-    if (newHash.empty()) {
-        return; // we're done
-    }
-
-    // use a cloned SGPath and reset its caching to force one stat() call
-    SGPath p2(p);
-    p2.set_cached(false);
-    p2.set_cached(true);
-
-    HashCacheEntry entry;
-    entry.filePath = ps;
-    entry.hashHex = newHash;
-    entry.modTime = p2.modTime();
-    entry.lengthBytes = p2.sizeInBytes();
-    hashes.insert(std::make_pair(ps, entry));
-    ++hashCacheDirty;
-}
-
-void HTTPRepoPrivate::writeHashCache()
-{
-    if (hashCacheDirty == 0) {
-        return;
-    }
-
-    SGPath cachePath = basePath;
-    cachePath.append(".hashes");
-    sg_ofstream stream(cachePath, std::ios::out | std::ios::trunc | std::ios::binary);
-    for (const auto& e : hashes) {
-        const auto& entry = e.second;
-        stream << entry.filePath << "*" << entry.modTime << "*"
-               << entry.lengthBytes << "*" << entry.hashHex << "\n";
-    }
-    stream.close();
-    hashCacheDirty = 0;
-}
-
-void HTTPRepoPrivate::parseHashCache()
-{
-    hashes.clear();
-    SGPath cachePath = basePath;
-    cachePath.append(".hashes");
-    if (!cachePath.exists()) {
-        return;
-    }
-
-    sg_ifstream stream(cachePath, std::ios::in);
-
-    while (!stream.eof()) {
-        std::string line;
-        std::getline(stream, line);
-        line = simgear::strutils::strip(line);
-        if (line.empty() || line[0] == '#')
-            continue;
-
-        string_list tokens = simgear::strutils::split(line, "*");
-        if (tokens.size() < 4) {
-            SG_LOG(SG_TERRASYNC, SG_WARN, "invalid entry in '" << cachePath << "': '" << line << "' (ignoring line)");
-            continue;
-        }
-
-        const std::string nameData = simgear::strutils::strip(tokens[0]);
-        const std::string timeData = simgear::strutils::strip(tokens[1]);
-        const std::string sizeData = simgear::strutils::strip(tokens[2]);
-        const std::string hashData = simgear::strutils::strip(tokens[3]);
-        if (nameData.empty() || timeData.empty() || sizeData.empty() || hashData.empty()) {
-            SG_LOG(SG_TERRASYNC, SG_WARN, "invalid entry in '" << cachePath << "': '" << line << "' (ignoring line)");
-            continue;
-        }
-
-        HashCacheEntry entry;
-        entry.filePath = nameData;
-        entry.hashHex = hashData;
-        entry.modTime = strtol(timeData.c_str(), NULL, 10);
-        entry.lengthBytes = strtol(sizeData.c_str(), NULL, 10);
-        hashes.insert(std::make_pair(entry.filePath, entry));
-    }
-}
-
 class DirectoryWithPath
 {
 public:
@@ -1184,16 +1201,12 @@ HTTPRepository::failure() const
         if (it != directories.end()) {
             assert((*it)->absolutePath() == absPath);
             directories.erase(it);
         } else {
             // we encounter this code path when deleting an orphaned directory
         }
 
         Dir dir(absPath);
         bool result = dir.remove(true);
 
-        // update the hash cache too
-        updatedFileContents(absPath, std::string());
-
         return result;
     }
@@ -1229,13 +1242,9 @@ HTTPRepository::failure() const
             http->makeRequest(rr);
         }
 
-        // rate limit how often we write this, since otherwise
-        // it dominates the time on Windows. 256 seems about right,
-        // causes a write a few times a minute.
-        if (hashCacheDirty > 256) {
-            writeHashCache();
+        if (countDirtyHashCaches() > 32) {
+            flushHashCaches();
         }
 
         checkForComplete();
     }
@@ -1307,4 +1316,24 @@ HTTPRepository::failure() const
     pendingUpdateOfChildren.push_back(dir);
 }
 
+int HTTPRepoPrivate::countDirtyHashCaches() const
+{
+    int result = rootDir->isHashCacheDirty() ? 1 : 0;
+    for (const auto& dir : directories) {
+        if (dir->isHashCacheDirty()) {
+            ++result;
+        }
+    }
+
+    return result;
+}
+
+void HTTPRepoPrivate::flushHashCaches()
+{
+    rootDir->writeHashCache();
+    for (const auto& dir : directories) {
+        dir->writeHashCache();
+    }
+}
+
 } // of namespace simgear
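For reference, the validity rule the per-directory cache relies on — a stored hash is reused only while both the file's size and modification time still match the recorded values — can be restated in isolation. A minimal sketch, using std::filesystem in place of SimGear's SGPath and with invented names, so it is an illustration rather than the code from this commit:

    #include <cstdint>
    #include <filesystem>
    #include <string>
    #include <unordered_map>

    namespace fs = std::filesystem;

    struct CacheEntry {
        fs::file_time_type modTime;
        std::uintmax_t lengthBytes;
        std::string hashHex;
    };

    // Return the cached hash if the file on disk still matches the recorded
    // size and mtime; an empty result means the caller must rehash the file.
    std::string cachedHashIfFresh(std::unordered_map<std::string, CacheEntry>& cache,
                                  const fs::path& p)
    {
        auto it = cache.find(p.string());
        if (it == cache.end())
            return {};

        std::error_code ec;
        const auto size = fs::file_size(p, ec);
        if (!ec && size == it->second.lengthBytes) {
            const auto mtime = fs::last_write_time(p, ec);
            if (!ec && mtime == it->second.modTime)
                return it->second.hashHex;
        }
        cache.erase(it); // stale or unreadable: drop the entry, force a rehash
        return {};
    }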

simgear/io/HTTPRepository_private.hxx

@@ -54,16 +54,7 @@ typedef SGSharedPtr<HTTPRepoGetRequest> RepoRequestPtr;
 class HTTPRepoPrivate {
 public:
-    struct HashCacheEntry {
-        std::string filePath;
-        time_t modTime;
-        size_t lengthBytes;
-        std::string hashHex;
-    };
-
-    using HashCache = std::unordered_map<std::string, HashCacheEntry>;
-    HashCache hashes;
-    int hashCacheDirty = 0;
-
     HTTPRepository::FailureVec failures;
     int maxPermittedFailures = 16;
@@ -91,12 +82,6 @@ public:
     HTTP::Request_ptr updateDir(HTTPDirectory *dir, const std::string &hash,
                                 size_t sz);
 
-    std::string hashForPath(const SGPath &p);
-    void updatedFileContents(const SGPath &p, const std::string &newHash);
-    void parseHashCache();
-    std::string computeHashForPath(const SGPath &p);
-    void writeHashCache();
-
     void failedToGetRootIndex(HTTPRepository::ResultCode st);
     void failedToUpdateChild(const SGPath &relativePath,
                              HTTPRepository::ResultCode fileStatus);
@@ -128,6 +113,9 @@ public:
     std::deque<HTTPDirectory*> pendingUpdateOfChildren;
 
     SGPath installedCopyPath;
 
+    int countDirtyHashCaches() const;
+    void flushHashCaches();
 };
 
 } // namespace simgear
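A side note on computeHashForPath() in the new anonymous namespace: it manages its 1 MiB buffer with malloc/free, so every early-exit path must remember to free it by hand. A sketch of the same routine with the buffer owned by std::vector instead — it assumes the same sha1.h helpers, SGBinaryFile, and strutils::encodeHex that the diff already uses, so it is a possible variant rather than part of this commit:

    #include <vector>

    std::string computeHashForPathRAII(const SGPath& p)
    {
        if (!p.exists())
            return {};

        sha1nfo info;
        sha1_init(&info);
        std::vector<char> buf(1024 * 1024); // released automatically on every path

        SGBinaryFile f(p);
        if (!f.open(SG_IO_IN)) {
            throw sg_io_exception("Couldn't open file for compute hash", p);
        }

        int readLen;
        while ((readLen = f.read(buf.data(), static_cast<int>(buf.size()))) > 0) {
            sha1_write(&info, buf.data(), readLen);
        }
        f.close();

        std::string hashBytes(reinterpret_cast<char*>(sha1_result(&info)), HASH_LENGTH);
        return strutils::encodeHex(hashBytes);
    }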