We are running a sparklyr job that runs queries on a Cloudera CDP Hive cluster.
The job sometimes stalls just before a dbWriteTable call, doing nothing and running indefinitely. It doesn't always stall at the same point, but it always happens during an invocation of this trywrite function, and no error is caught:
# Write `df` to the table `new_name` over connection `sc`, retrying on error.
#
# Arguments:
#   sc       - connection object; checked with DBI::dbIsValid() before every
#              attempt and passed to DBI::dbWriteTable().
#   new_name - name of the table to write.
#   df       - data passed to DBI::dbWriteTable() as the table content.
#   log_obj  - logger object, passed as first argument to error().
#   wait_sec - seconds to sleep between a failed attempt and the next one.
#   max_wait - retry budget in seconds, measured from entry into this function.
#
# Returns:
#   The value returned by DBI::dbWriteTable() on the first successful attempt.
#
# Errors:
#   Stops immediately (no retry) when the connection is reported as invalid.
#   Stops when no attempt succeeded and no further attempt fits in `max_wait`.
#
# NOTE(review): tryCatch() only handles errors that are actually raised, and
# the elapsed-time check only runs between attempts. A dbWriteTable() call that
# blocks without raising is therefore NOT interrupted by `max_wait`.
trywrite = function(sc, new_name, df, log_obj, wait_sec = 600, max_wait = 3600)
{
  start_time = Sys.time()
  elapsed_secs = function() {
    as.numeric(difftime(Sys.time(), start_time, units = 'secs'))
  }

  while (elapsed_secs() <= max_wait) {
    print(paste0('Attempt to write table: ', new_name, ' - ', Sys.time()))

    # An invalid connection is treated as fatal: it is raised outside the
    # tryCatch below, so it is never retried.
    if (!DBI::dbIsValid(sc)) {
      error(log_obj, paste0('Connection not valid during write table: ', new_name))
      stop(paste0('Failed to write table: ', new_name))
    }

    print(paste0('Writing table: ', new_name))
    # Capture the outcome as a value instead of returning from inside
    # tryCatch, so success and failure handling are both plain code below.
    outcome = tryCatch(
      list(ok = TRUE, value = DBI::dbWriteTable(sc, new_name, df)),
      error = function(e) list(ok = FALSE, message = conditionMessage(e))
    )

    if (outcome$ok) {
      print(paste0('Write completed table: ', new_name, ' - ', Sys.time()))
      return(outcome$value)
    }

    # Log the real cause: any write error ends up here, not only connection
    # problems, so the message must not claim the connection was invalid.
    error(log_obj, paste0('Error during write table: ', new_name, ' - ',
                          Sys.time(), ' - ', outcome$message))
    print(paste0('Error message: ', outcome$message))

    # If sleeping would carry us past the deadline, no further attempt can
    # happen; fail now instead of sleeping `wait_sec` for nothing.
    if (elapsed_secs() + wait_sec > max_wait) {
      break
    }

    print(paste0('Retrying in ', wait_sec, ' seconds: ', Sys.time()))
    Sys.sleep(wait_sec)
  }

  stop(paste0('Failed to write table before max time: ', new_name))
}