TerraSync: restart after max-errors is exceeded.

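When we trip the max-error count for a session, stop the worker, back off for a
randomised period (its upper bound grows by 60 seconds each time the limit is
tripped, capped at 15 minutes) and then retry (selecting a new TerraSync server).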
This commit is contained in:
James Turner 2020-08-05 15:18:14 +01:00
parent ece7dab47c
commit f72e2ae644
2 changed files with 36 additions and 26 deletions

View File

@ -201,7 +201,6 @@ struct TerrasyncThreadState
_updated_tile_count(0), _updated_tile_count(0),
_success_count(0), _success_count(0),
_consecutive_errors(0), _consecutive_errors(0),
_allowed_errors(6),
_cache_hits(0), _cache_hits(0),
_transfer_rate(0), _transfer_rate(0),
_total_kb_downloaded(0), _total_kb_downloaded(0),
@ -215,7 +214,6 @@ struct TerrasyncThreadState
int _updated_tile_count; int _updated_tile_count;
int _success_count; int _success_count;
int _consecutive_errors; int _consecutive_errors;
int _allowed_errors;
int _cache_hits; int _cache_hits;
int _transfer_rate; int _transfer_rate;
// kbytes, not bytes, because bytes might overflow 2^31 // kbytes, not bytes, because bytes might overflow 2^31
@ -302,12 +300,6 @@ public:
void setInstalledDir(const SGPath& p) { _installRoot = p; } void setInstalledDir(const SGPath& p) { _installRoot = p; }
void setAllowedErrorCount(int errors)
{
std::lock_guard<std::mutex> g(_stateLock);
_state._allowed_errors = errors;
}
void setCacheHits(unsigned int hits) void setCacheHits(unsigned int hits)
{ {
std::lock_guard<std::mutex> g(_stateLock); std::lock_guard<std::mutex> g(_stateLock);
@ -565,6 +557,7 @@ void SGTerraSync::WorkerThread::updateSyncSlot(SyncSlot &slot)
// check result // check result
HTTPRepository::ResultCode res = slot.repository->failure(); HTTPRepository::ResultCode res = slot.repository->failure();
if (res == HTTPRepository::REPO_ERROR_NOT_FOUND) { if (res == HTTPRepository::REPO_ERROR_NOT_FOUND) {
notFound(slot.currentItem); notFound(slot.currentItem);
} else if (res != HTTPRepository::REPO_NO_ERROR) { } else if (res != HTTPRepository::REPO_NO_ERROR) {
@ -631,21 +624,23 @@ void SGTerraSync::WorkerThread::updateSyncSlot(SyncSlot &slot)
void SGTerraSync::WorkerThread::runInternal() void SGTerraSync::WorkerThread::runInternal()
{ {
unsigned dnsRetryCount = 0;
while (!_stop) { while (!_stop) {
// try to find a terrasync server // try to find a terrasync server
if( !hasServer() ) { if( !hasServer() ) {
if( ++dnsRetryCount > 5 ) { const auto haveServer = findServer();
SG_LOG(SG_TERRASYNC, SG_WARN, "Can't find a terrasync server. TS disabled."); if (haveServer) {
break; hasServer(true);
}
if( hasServer( findServer() ) ) { std::lock_guard<std::mutex> g(_stateLock);
SG_LOG(SG_TERRASYNC, SG_INFO, "terrasync scenery provider of the day is '" << _httpServer << "'"); _state._consecutive_errors = 0;
}
continue; SG_LOG(SG_TERRASYNC, SG_INFO, "terrasync scenery provider of the day is '" << _httpServer << "'");
} else {
std::lock_guard<std::mutex> g(_stateLock);
_state._consecutive_errors++;
}
continue;
} }
dnsRetryCount = 0;
try { try {
_http.update(10); _http.update(10);
@ -886,7 +881,6 @@ void SGTerraSync::reinit()
SGPath installPath(_terraRoot->getStringValue("installation-dir")); SGPath installPath(_terraRoot->getStringValue("installation-dir"));
_workerThread->setInstalledDir(installPath); _workerThread->setInstalledDir(installPath);
_workerThread->setAllowedErrorCount(_terraRoot->getIntValue("max-errors",5));
_workerThread->setCacheHits(_terraRoot->getIntValue("cache-hit", 0)); _workerThread->setCacheHits(_terraRoot->getIntValue("cache-hit", 0));
if (_workerThread->start()) if (_workerThread->start())
@ -929,12 +923,7 @@ void SGTerraSync::bind()
_downloadedKBtesNode = _terraRoot->getNode("downloaded-kbytes", true); _downloadedKBtesNode = _terraRoot->getNode("downloaded-kbytes", true);
_enabledNode = _terraRoot->getNode("enabled", true); _enabledNode = _terraRoot->getNode("enabled", true);
_availableNode = _terraRoot->getNode("available", true); _availableNode = _terraRoot->getNode("available", true);
//_busyNode->setAttribute(SGPropertyNode::WRITE, false); _maxErrorsNode = _terraRoot->getNode("max-errors", true);
//_activeNode->setAttribute(SGPropertyNode::WRITE, false);
//_updateCountNode->setAttribute(SGPropertyNode::WRITE, false);
//_errorCountNode->setAttribute(SGPropertyNode::WRITE, false);
//_tileCountNode->setAttribute(SGPropertyNode::WRITE, false);
} }
void SGTerraSync::unbind() void SGTerraSync::unbind()
@ -954,6 +943,11 @@ void SGTerraSync::update(double)
auto enabled = _enabledNode->getBoolValue(); auto enabled = _enabledNode->getBoolValue();
auto worker_running = _workerThread->isRunning(); auto worker_running = _workerThread->isRunning();
// hold enabled false until retry time passes
if (enabled && (_retryTime > SGTimeStamp::now())) {
enabled = false;
}
// see if the enabled status has changed; and if so take the appropriate action. // see if the enabled status has changed; and if so take the appropriate action.
if (enabled && !worker_running) if (enabled && !worker_running)
{ {
@ -979,6 +973,16 @@ void SGTerraSync::update(double)
_stalledNode->setBoolValue(_workerThread->isStalled()); _stalledNode->setBoolValue(_workerThread->isStalled());
_activeNode->setBoolValue(worker_running); _activeNode->setBoolValue(worker_running);
int allowedErrors = _maxErrorsNode->getIntValue();
if (worker_running && (copiedState._consecutive_errors >= allowedErrors)) {
_workerThread->stop();
_retryBackOffSeconds = std::min(_retryBackOffSeconds + 60, 60u * 15);
const int seconds = static_cast<int>(sg_random() * _retryBackOffSeconds);
_retryTime = SGTimeStamp::now() + SGTimeStamp::fromSec(seconds);
SG_LOG(SG_TERRASYNC, SG_ALERT, "Terrasync paused due to " << copiedState._consecutive_errors << " consecutive errors during sync; will retry in " << seconds << " seconds.");
}
while (_workerThread->hasNewTiles()) while (_workerThread->hasNewTiles())
{ {
// ensure they are popped // ensure they are popped

View File

@ -107,6 +107,7 @@ private:
SGPropertyNode_ptr _transferRateBytesSecNode; SGPropertyNode_ptr _transferRateBytesSecNode;
SGPropertyNode_ptr _pendingKbytesNode; SGPropertyNode_ptr _pendingKbytesNode;
SGPropertyNode_ptr _downloadedKBtesNode; SGPropertyNode_ptr _downloadedKBtesNode;
SGPropertyNode_ptr _maxErrorsNode;
// we manually bind+init TerraSync during early startup // we manually bind+init TerraSync during early startup
// to get better overlap of slow operations (Shared Models sync // to get better overlap of slow operations (Shared Models sync
@ -116,6 +117,11 @@ private:
simgear::TiedPropertyList _tiedProperties; simgear::TiedPropertyList _tiedProperties;
BufferedLogCallback* _log; BufferedLogCallback* _log;
/// if we disabled TerraSync due to errors, this is the time at which we will restart it
/// automatically.
SGTimeStamp _retryTime;
unsigned int _retryBackOffSeconds = 0;
}; };
} }