From b5c4d7bd651d4f9ad1b1f2d3ab20b63ae67275b4 Mon Sep 17 00:00:00 2001 From: "d.kovalenko" Date: Thu, 27 Feb 2025 23:12:20 +0300 Subject: [PATCH 1/6] PostgresNode::pid is improved - We do multiple attempts to read pid file. - We process a case when we see that node is stopped between test and read. - We process a case when pid-file is empty. --- testgres/node.py | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/testgres/node.py b/testgres/node.py index 56899b90..890ffcc6 100644 --- a/testgres/node.py +++ b/testgres/node.py @@ -132,6 +132,9 @@ class PostgresNode(object): # a max number of node start attempts _C_MAX_START_ATEMPTS = 5 + # a max number of read pid file attempts + _C_MAX_GET_PID_ATEMPTS = 5 + def __init__(self, name=None, base_dir=None, port=None, conn_params: ConnectionParams = ConnectionParams(), bin_dir=None, prefix=None): """ @@ -208,14 +211,39 @@ def pid(self): Return postmaster's PID if node is running, else 0. """ - if self.status(): - pid_file = os.path.join(self.data_dir, PG_PID_FILE) - lines = self.os_ops.readlines(pid_file) - pid = int(lines[0]) if lines else None - return pid + nAttempt = 0 + pid_file = os.path.join(self.data_dir, PG_PID_FILE) + pid_s: str = None + while True: + if nAttempt == __class__._C_MAX_GET_PID_ATEMPTS: + errMsg = "Can't read postmaster pid file [{0}].".format(pid_file) + raise Exception(errMsg) - # for clarity - return 0 + nAttempt += 1 + + s1 = self.status() + if s1 != NodeStatus.Running: + return 0 + + try: + lines = self.os_ops.readlines(pid_file) + except Exception: + s2 = self.status() + if s2 == NodeStatus.Running: + raise + return 0 + + assert lines is not None # [2025-02-27] OK? + assert type(lines) == list # noqa: E721 + if len(lines) == 0: + continue + + pid_s = lines[0] + if len(pid_s) == 0: + continue + + pid = int(pid_s) + return pid @property def auxiliary_pids(self): From fce595a854cebe307d8290cc436298a7398c797d Mon Sep 17 00:00:00 2001 From: "d.kovalenko" Date: Thu, 27 Feb 2025 23:51:23 +0300 Subject: [PATCH 2/6] PostgresNode::pid is updated Assert is added. --- testgres/node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/testgres/node.py b/testgres/node.py index 890ffcc6..eed54408 100644 --- a/testgres/node.py +++ b/testgres/node.py @@ -239,6 +239,7 @@ def pid(self): continue pid_s = lines[0] + assert type(pid_s) == str # noqa: E721 if len(pid_s) == 0: continue From 0863cf582cc3a57fb1344017ffea86a7355affcb Mon Sep 17 00:00:00 2001 From: "d.kovalenko" Date: Fri, 28 Feb 2025 12:29:47 +0300 Subject: [PATCH 3/6] execute_utility2 is updated (ignore_errors) - New parameters "ignore_errors" is added. Default value is False. - Asserts are added. --- testgres/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/testgres/utils.py b/testgres/utils.py index 9645fc3b..76d42b02 100644 --- a/testgres/utils.py +++ b/testgres/utils.py @@ -73,11 +73,13 @@ def execute_utility(args, logfile=None, verbose=False): return execute_utility2(tconf.os_ops, args, logfile, verbose) -def execute_utility2(os_ops: OsOperations, args, logfile=None, verbose=False): +def execute_utility2(os_ops: OsOperations, args, logfile=None, verbose=False, ignore_errors=False): assert os_ops is not None assert isinstance(os_ops, OsOperations) + assert type(verbose) == bool # noqa: E721 + assert type(ignore_errors) == bool # noqa: E721 - exit_status, out, error = os_ops.exec_command(args, verbose=True) + exit_status, out, error = os_ops.exec_command(args, verbose=True, ignore_errors=ignore_errors) # decode result out = '' if not out else out if isinstance(out, bytes): From 09976aea2d9317f3a2ea2afb4d83267c253e9650 Mon Sep 17 00:00:00 2001 From: "d.kovalenko" Date: Fri, 28 Feb 2025 13:21:50 +0300 Subject: [PATCH 4/6] PostgresNode::_try_shutdown is rewrited (normalization) --- testgres/node.py | 107 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 32 deletions(-) diff --git a/testgres/node.py b/testgres/node.py index 56899b90..cb5862cb 100644 --- a/testgres/node.py +++ b/testgres/node.py @@ -338,41 +338,84 @@ def version(self): return self._pg_version def _try_shutdown(self, max_attempts, with_force=False): + assert type(max_attempts) == int # noqa: E721 + assert type(with_force) == bool # noqa: E721 + assert max_attempts > 0 + attempts = 0 + + # try stopping server N times + while attempts < max_attempts: + attempts += 1 + try: + self.stop() + except ExecUtilException: + continue # one more time + except Exception: + eprint('cannot stop node {}'.format(self.name)) + break + + return # OK + + # If force stopping is enabled and PID is valid + if not with_force: + return False + node_pid = self.pid + assert node_pid is not None + assert type(node_pid) == int # noqa: E721 - if node_pid > 0: - # try stopping server N times - while attempts < max_attempts: - try: - self.stop() - break # OK - except ExecUtilException: - pass # one more time - except Exception: - eprint('cannot stop node {}'.format(self.name)) - break - - attempts += 1 - - # If force stopping is enabled and PID is valid - if with_force and node_pid != 0: - # If we couldn't stop the node - p_status_output = self.os_ops.exec_command(cmd=f'ps -o pid= -p {node_pid}', shell=True, ignore_errors=True).decode('utf-8') - if self.status() != NodeStatus.Stopped and p_status_output and str(node_pid) in p_status_output: - try: - eprint(f'Force stopping node {self.name} with PID {node_pid}') - self.os_ops.kill(node_pid, signal.SIGKILL, expect_error=False) - except Exception: - # The node has already stopped - pass - - # Check that node stopped - print only column pid without headers - p_status_output = self.os_ops.exec_command(f'ps -o pid= -p {node_pid}', shell=True, ignore_errors=True).decode('utf-8') - if p_status_output and str(node_pid) in p_status_output: - eprint(f'Failed to stop node {self.name}.') - else: - eprint(f'Node {self.name} has been stopped successfully.') + if node_pid == 0: + return + + # TODO: [2025-02-28] It is really the old ugly code. We have to rewrite it! + + ps_command = ['ps', '-o', 'pid=', '-p', str(node_pid)] + + ps_output = self.os_ops.exec_command(cmd=ps_command, shell=True, ignore_errors=True).decode('utf-8') + assert type(ps_output) == str # noqa: E721 + + if ps_output == "": + return + + if ps_output != str(node_pid): + __class__._throw_bugcheck__unexpected_result_of_ps( + ps_output, + ps_command) + + try: + eprint('Force stopping node {0} with PID {1}'.format(self.name, node_pid)) + self.os_ops.kill(node_pid, signal.SIGKILL, expect_error=False) + except Exception: + # The node has already stopped + pass + + # Check that node stopped - print only column pid without headers + ps_output = self.os_ops.exec_command(cmd=ps_command, shell=True, ignore_errors=True).decode('utf-8') + assert type(ps_output) == str # noqa: E721 + + if ps_output == "": + eprint('Node {0} has been stopped successfully.'.format(self.name)) + return + + if ps_output == str(node_pid): + eprint('Failed to stop node {0}.'.format(self.name)) + return + + __class__._throw_bugcheck__unexpected_result_of_ps( + ps_output, + ps_command) + + @staticmethod + def _throw_bugcheck__unexpected_result_of_ps(result, cmd): + assert type(result) == str # noqa: E721 + assert type(cmd) == list # noqa: E721 + errLines = [] + errLines.append("[BUG CHECK] Unexpected result of command ps:") + errLines.append(result) + errLines.append("-----") + errLines.append("Command line is {0}".format(cmd)) + raise RuntimeError("\n".join(errLines)) def _assign_master(self, master): """NOTE: this is a private method!""" From 0402c4a6145820043bf94c879921d244ddd8d09a Mon Sep 17 00:00:00 2001 From: "d.kovalenko" Date: Fri, 28 Feb 2025 13:59:41 +0300 Subject: [PATCH 5/6] PostgresNode::pid uses the data from "pg_ctl status" output. --- testgres/consts.py | 4 ++ testgres/node.py | 157 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 130 insertions(+), 31 deletions(-) diff --git a/testgres/consts.py b/testgres/consts.py index 98c84af6..89c49ab7 100644 --- a/testgres/consts.py +++ b/testgres/consts.py @@ -35,3 +35,7 @@ # logical replication settings LOGICAL_REPL_MAX_CATCHUP_ATTEMPTS = 60 + +PG_CTL__STATUS__OK = 0 +PG_CTL__STATUS__NODE_IS_STOPPED = 3 +PG_CTL__STATUS__BAD_DATADIR = 4 diff --git a/testgres/node.py b/testgres/node.py index 6250fd55..744c5516 100644 --- a/testgres/node.py +++ b/testgres/node.py @@ -49,7 +49,9 @@ RECOVERY_CONF_FILE, \ PG_LOG_FILE, \ UTILS_LOG_FILE, \ - PG_PID_FILE + PG_CTL__STATUS__OK, \ + PG_CTL__STATUS__NODE_IS_STOPPED, \ + PG_CTL__STATUS__BAD_DATADIR \ from .consts import \ MAX_LOGICAL_REPLICATION_WORKERS, \ @@ -132,9 +134,6 @@ class PostgresNode(object): # a max number of node start attempts _C_MAX_START_ATEMPTS = 5 - # a max number of read pid file attempts - _C_MAX_GET_PID_ATEMPTS = 5 - def __init__(self, name=None, base_dir=None, port=None, conn_params: ConnectionParams = ConnectionParams(), bin_dir=None, prefix=None): """ @@ -211,40 +210,136 @@ def pid(self): Return postmaster's PID if node is running, else 0. """ - nAttempt = 0 - pid_file = os.path.join(self.data_dir, PG_PID_FILE) - pid_s: str = None + self__data_dir = self.data_dir + + _params = [ + self._get_bin_path('pg_ctl'), + "-D", self__data_dir, + "status" + ] # yapf: disable + + status_code, out, error = execute_utility2( + self.os_ops, + _params, + self.utils_log_file, + verbose=True, + ignore_errors=True) + + assert type(status_code) == int # noqa: E721 + assert type(out) == str # noqa: E721 + assert type(error) == str # noqa: E721 + + # ----------------- + if status_code == PG_CTL__STATUS__NODE_IS_STOPPED: + return 0 + + # ----------------- + if status_code == PG_CTL__STATUS__BAD_DATADIR: + return 0 + + # ----------------- + if status_code != PG_CTL__STATUS__OK: + errMsg = "Getting of a node status [data_dir is {0}] failed.".format(self__data_dir) + + raise ExecUtilException( + message=errMsg, + command=_params, + exit_code=status_code, + out=out, + error=error, + ) + + # ----------------- + assert status_code == PG_CTL__STATUS__OK + + if out == "": + __class__._throw_error__pg_ctl_returns_an_empty_string( + _params + ) + + C_PID_PREFIX = "(PID: " + + i = out.find(C_PID_PREFIX) + + if i == -1: + __class__._throw_error__pg_ctl_returns_an_unexpected_string( + out, + _params + ) + + assert i > 0 + assert i < len(out) + assert len(C_PID_PREFIX) <= len(out) + assert i <= len(out) - len(C_PID_PREFIX) + + i += len(C_PID_PREFIX) + start_pid_s = i + while True: - if nAttempt == __class__._C_MAX_GET_PID_ATEMPTS: - errMsg = "Can't read postmaster pid file [{0}].".format(pid_file) - raise Exception(errMsg) + if i == len(out): + __class__._throw_error__pg_ctl_returns_an_unexpected_string( + out, + _params + ) - nAttempt += 1 + ch = out[i] - s1 = self.status() - if s1 != NodeStatus.Running: - return 0 + if ch == ")": + break - try: - lines = self.os_ops.readlines(pid_file) - except Exception: - s2 = self.status() - if s2 == NodeStatus.Running: - raise - return 0 - - assert lines is not None # [2025-02-27] OK? - assert type(lines) == list # noqa: E721 - if len(lines) == 0: + if ch.isdigit(): + i += 1 continue - pid_s = lines[0] - assert type(pid_s) == str # noqa: E721 - if len(pid_s) == 0: - continue + __class__._throw_error__pg_ctl_returns_an_unexpected_string( + out, + _params + ) + assert False + + if i == start_pid_s: + __class__._throw_error__pg_ctl_returns_an_unexpected_string( + out, + _params + ) - pid = int(pid_s) - return pid + # TODO: Let's verify a length of pid string. + + pid = int(out[start_pid_s:i]) + + if pid == 0: + __class__._throw_error__pg_ctl_returns_a_zero_pid( + out, + _params + ) + + assert pid != 0 + return pid + + @staticmethod + def _throw_error__pg_ctl_returns_an_empty_string(_params): + errLines = [] + errLines.append("Utility pg_ctl returns empty string.") + errLines.append("Command line is {0}".format(_params)) + raise RuntimeError("\n".join(errLines)) + + @staticmethod + def _throw_error__pg_ctl_returns_an_unexpected_string(out, _params): + errLines = [] + errLines.append("Utility pg_ctl returns an unexpected string:") + errLines.append(out) + errLines.append("------------") + errLines.append("Command line is {0}".format(_params)) + raise RuntimeError("\n".join(errLines)) + + @staticmethod + def _throw_error__pg_ctl_returns_a_zero_pid(out, _params): + errLines = [] + errLines.append("Utility pg_ctl returns a zero pid. Output string is:") + errLines.append(out) + errLines.append("------------") + errLines.append("Command line is {0}".format(_params)) + raise RuntimeError("\n".join(errLines)) @property def auxiliary_pids(self): From 45d2b176dc618156312639e41f09c27473266a1f Mon Sep 17 00:00:00 2001 From: "d.kovalenko" Date: Fri, 28 Feb 2025 16:02:54 +0300 Subject: [PATCH 6/6] PostgresNode::_try_shutdown is correct (return None) This method returns nothing (None). --- testgres/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testgres/node.py b/testgres/node.py index cb5862cb..5626e2fe 100644 --- a/testgres/node.py +++ b/testgres/node.py @@ -359,7 +359,7 @@ def _try_shutdown(self, max_attempts, with_force=False): # If force stopping is enabled and PID is valid if not with_force: - return False + return node_pid = self.pid assert node_pid is not None