From 3496020d2bc7152a5ad2abae12faea8410a2e263 Mon Sep 17 00:00:00 2001 From: Daniel Shelepanov Date: Thu, 28 Apr 2022 19:06:09 +0300 Subject: [PATCH 1/5] [PBCKP-150] Reading buffer is flushed each time we verify the checksum. The race condition is covered with a unit-test, the buffer is flushed now so each of 300 reads requests the data from the disc. --- src/data.c | 2 ++ tests/__init__.py | 8 ++++- tests/pbckp150.py | 76 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 tests/pbckp150.py diff --git a/src/data.c b/src/data.c index f02e3fd14..9bb99dabe 100644 --- a/src/data.c +++ b/src/data.c @@ -301,6 +301,8 @@ prepare_page(pgFile *file, XLogRecPtr prev_backup_start_lsn, { /* read the block */ int read_len = fio_pread(in, page, blknum * BLCKSZ); + /* avoid re-reading once buffered data, see PBCKP-150 */ + fflush(in); /* The block could have been truncated. It is fine. */ if (read_len == 0) diff --git a/tests/__init__.py b/tests/__init__.py index 55d6ea9be..868775153 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,7 +7,7 @@ compression, page, ptrack, archive, exclude, cfs_backup, cfs_restore, \ cfs_validate_backup, auth_test, time_stamp, logging, \ locking, remote, external, config, checkdb, set_backup, incr_restore, \ - catchup, CVE_2018_1058 + catchup, CVE_2018_1058, pbckp150 def load_tests(loader, tests, pattern): @@ -21,6 +21,12 @@ def load_tests(loader, tests, pattern): if os.environ['PG_PROBACKUP_PTRACK'] == 'ON': suite.addTests(loader.loadTestsFromModule(ptrack)) + # PG_PROBACKUP_LONG section for tests that are long + # by design e.g. they contain loops, sleeps and so on + if 'PG_PROBACKUP_LONG' in os.environ: + if os.environ['PG_PROBACKUP_LONG'] == 'ON': + suite.addTests(loader.loadTestsFromModule(pbckp150)) + # suite.addTests(loader.loadTestsFromModule(auth_test)) suite.addTests(loader.loadTestsFromModule(archive)) suite.addTests(loader.loadTestsFromModule(backup)) diff --git a/tests/pbckp150.py b/tests/pbckp150.py new file mode 100644 index 000000000..8d2694b9b --- /dev/null +++ b/tests/pbckp150.py @@ -0,0 +1,76 @@ +import os +import unittest +from .helpers.ptrack_helpers import ProbackupTest +import subprocess +from time import sleep + +module_name = 'pbckp150' + +class TestPageCorruptionDueToRace(ProbackupTest, unittest.TestCase): + def test_pbckp150(self): + """ + https://jira.postgrespro.ru/browse/PBCKP-150 + create a node filled with pgbench + create FULL backup followed by PTRACK backup + run pgbench, vacuum VERBOSE FULL and ptrack backups in parallel + """ + # init node + fname = self.id().split('.')[3] + node = self.make_simple_node( + base_dir=os.path.join(module_name, fname, 'node'), + set_replication=True, + initdb_params=['--data-checksums']) + node.append_conf('postgresql.conf', + """ + max_connections = 100 + wal_keep_size = 16000 + ptrack.map_size = 1 + shared_preload_libraries='ptrack' + log_statement = 'none' + fsync = off + log_checkpoints = on + autovacuum = off + """) + + # init probackup and add an instance + backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup') + self.init_pb(backup_dir) + self.add_instance(backup_dir, 'node', node) + + # run the node and init ptrack + node.slow_start() + node.safe_psql("postgres", "CREATE EXTENSION ptrack") + # populate it with pgbench + node.pgbench_init(scale=5) + + # FULL backup followed by PTRACK backup + self.backup_node(backup_dir, 'node', node, options=['--stream']) + self.backup_node(backup_dir, 'node', node, backup_type='ptrack', options=['--stream']) + + # run ordinary pgbench scenario to imitate some activity and another pgbench for vacuuming in parallel + nBenchDuration = 30 + pgbench = node.pgbench(options=['-c', '20', '-j', '8', '-T', str(nBenchDuration)]) + with open('/tmp/pbckp150vacuum.sql', 'w') as f: + f.write('VACUUM (FULL) pgbench_accounts, pgbench_tellers, pgbench_history; SELECT pg_sleep(1);\n') + pgbenchval = node.pgbench(options=['-c', '1', '-f', '/tmp/pbckp150vacuum.sql', '-T', str(nBenchDuration)]) + + # several PTRACK backups + for i in range(nBenchDuration): + print("[{}] backing up PTRACK diff...".format(i+1)) + self.backup_node(backup_dir, 'node', node, backup_type='ptrack', options=['--stream', '--log-level-console', 'VERBOSE']) + sleep(0.1) + # if the activity pgbench has finished, stop backing up + if pgbench.poll() is not None: + break + + pgbench.kill() + pgbenchval.kill() + pgbench.wait() + pgbenchval.wait() + + backups = self.show_pb(backup_dir, 'node') + for b in backups: + self.assertEqual("OK", b['status']) + + # Clean after yourself + self.del_test_dir(module_name, fname) From de8634d562e9463adc8065848b9fe80a0b4a668b Mon Sep 17 00:00:00 2001 From: Daniel Shelepanov Date: Wed, 11 May 2022 11:55:20 +0300 Subject: [PATCH 2/5] [PBCKP-150] review fixes --- src/data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/data.c b/src/data.c index 9bb99dabe..9b4640de4 100644 --- a/src/data.c +++ b/src/data.c @@ -301,8 +301,9 @@ prepare_page(pgFile *file, XLogRecPtr prev_backup_start_lsn, { /* read the block */ int read_len = fio_pread(in, page, blknum * BLCKSZ); - /* avoid re-reading once buffered data, see PBCKP-150 */ - fflush(in); + /* avoid re-reading once buffered data, flushing on further attempts, see PBCKP-150 */ + if(try_again < PAGE_READ_ATTEMPTS - 1) + fflush(in); /* The block could have been truncated. It is fine. */ if (read_len == 0) From 9b88a5a0c904020dc70dd866c1ed45a166f0502a Mon Sep 17 00:00:00 2001 From: Daniel Shelepanov Date: Wed, 11 May 2022 12:45:39 +0300 Subject: [PATCH 3/5] [PBCKP-150] review fixes #2 --- src/data.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/data.c b/src/data.c index 9b4640de4..90a5315e2 100644 --- a/src/data.c +++ b/src/data.c @@ -301,9 +301,6 @@ prepare_page(pgFile *file, XLogRecPtr prev_backup_start_lsn, { /* read the block */ int read_len = fio_pread(in, page, blknum * BLCKSZ); - /* avoid re-reading once buffered data, flushing on further attempts, see PBCKP-150 */ - if(try_again < PAGE_READ_ATTEMPTS - 1) - fflush(in); /* The block could have been truncated. It is fine. */ if (read_len == 0) @@ -352,6 +349,8 @@ prepare_page(pgFile *file, XLogRecPtr prev_backup_start_lsn, Assert(false); } } + /* avoid re-reading once buffered data, flushing on further attempts, see PBCKP-150 */ + fflush(in); } /* From 260d63c8a57b32cef153fcb0cdfd010b952bbd98 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Wed, 25 May 2022 14:34:36 +0300 Subject: [PATCH 4/5] [PBCKP-150] rename pbckp150.py -> time_consuming.py file for further use in lengthy tests add description for new PG_PROBACKUP_LONG variable adding one test run of this test to travis --- .travis.yml | 17 +++++++++-------- tests/Readme.md | 2 ++ tests/__init__.py | 4 ++-- tests/{pbckp150.py => time_consuming.py} | 4 ++-- 4 files changed, 15 insertions(+), 12 deletions(-) rename tests/{pbckp150.py => time_consuming.py} (96%) diff --git a/.travis.yml b/.travis.yml index 876289e82..74c4ff313 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,14 +26,14 @@ notifications: # Default MODE is basic, i.e. all tests with PG_PROBACKUP_TEST_BASIC=ON env: - - PG_VERSION=15 PG_BRANCH=master PTRACK_PATCH_PG_BRANCH=master - - PG_VERSION=14 PG_BRANCH=REL_14_STABLE PTRACK_PATCH_PG_BRANCH=REL_14_STABLE - - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE - - PG_VERSION=12 PG_BRANCH=REL_12_STABLE PTRACK_PATCH_PG_BRANCH=REL_12_STABLE - - PG_VERSION=11 PG_BRANCH=REL_11_STABLE PTRACK_PATCH_PG_BRANCH=REL_11_STABLE - - PG_VERSION=10 PG_BRANCH=REL_10_STABLE - - PG_VERSION=9.6 PG_BRANCH=REL9_6_STABLE - - PG_VERSION=9.5 PG_BRANCH=REL9_5_STABLE +# - PG_VERSION=15 PG_BRANCH=master PTRACK_PATCH_PG_BRANCH=master +# - PG_VERSION=14 PG_BRANCH=REL_14_STABLE PTRACK_PATCH_PG_BRANCH=REL_14_STABLE +# - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE +# - PG_VERSION=12 PG_BRANCH=REL_12_STABLE PTRACK_PATCH_PG_BRANCH=REL_12_STABLE +# - PG_VERSION=11 PG_BRANCH=REL_11_STABLE PTRACK_PATCH_PG_BRANCH=REL_11_STABLE +# - PG_VERSION=10 PG_BRANCH=REL_10_STABLE +# - PG_VERSION=9.6 PG_BRANCH=REL9_6_STABLE +# - PG_VERSION=9.5 PG_BRANCH=REL9_5_STABLE # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=off MODE=archive # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=backup # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=catchup @@ -46,6 +46,7 @@ env: # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=replica # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=off MODE=retention # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=restore + - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=time_consuming jobs: allow_failures: diff --git a/tests/Readme.md b/tests/Readme.md index 668552c94..c4a67c5d7 100644 --- a/tests/Readme.md +++ b/tests/Readme.md @@ -41,6 +41,8 @@ Run suit of basic simple tests: Run ptrack tests: export PG_PROBACKUP_PTRACK=ON +Run long (time consuming) tests: + export PG_PROBACKUP_LONG=ON Usage: sudo echo 0 > /proc/sys/kernel/yama/ptrace_scope diff --git a/tests/__init__.py b/tests/__init__.py index 868775153..79537ad78 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,7 +7,7 @@ compression, page, ptrack, archive, exclude, cfs_backup, cfs_restore, \ cfs_validate_backup, auth_test, time_stamp, logging, \ locking, remote, external, config, checkdb, set_backup, incr_restore, \ - catchup, CVE_2018_1058, pbckp150 + catchup, CVE_2018_1058, time_consuming def load_tests(loader, tests, pattern): @@ -25,7 +25,7 @@ def load_tests(loader, tests, pattern): # by design e.g. they contain loops, sleeps and so on if 'PG_PROBACKUP_LONG' in os.environ: if os.environ['PG_PROBACKUP_LONG'] == 'ON': - suite.addTests(loader.loadTestsFromModule(pbckp150)) + suite.addTests(loader.loadTestsFromModule(time_consuming)) # suite.addTests(loader.loadTestsFromModule(auth_test)) suite.addTests(loader.loadTestsFromModule(archive)) diff --git a/tests/pbckp150.py b/tests/time_consuming.py similarity index 96% rename from tests/pbckp150.py rename to tests/time_consuming.py index 8d2694b9b..396ab716e 100644 --- a/tests/pbckp150.py +++ b/tests/time_consuming.py @@ -4,9 +4,9 @@ import subprocess from time import sleep -module_name = 'pbckp150' +module_name = 'time_consuming' -class TestPageCorruptionDueToRace(ProbackupTest, unittest.TestCase): +class TimeConsumingTests(ProbackupTest, unittest.TestCase): def test_pbckp150(self): """ https://jira.postgrespro.ru/browse/PBCKP-150 From b7b927bdd5c4b12ef349ae796781a12777a72ddf Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Wed, 25 May 2022 14:43:49 +0300 Subject: [PATCH 5/5] [PBCKP-150] revert travis configuration --- .travis.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 74c4ff313..b26bf27c6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,14 +26,14 @@ notifications: # Default MODE is basic, i.e. all tests with PG_PROBACKUP_TEST_BASIC=ON env: -# - PG_VERSION=15 PG_BRANCH=master PTRACK_PATCH_PG_BRANCH=master -# - PG_VERSION=14 PG_BRANCH=REL_14_STABLE PTRACK_PATCH_PG_BRANCH=REL_14_STABLE -# - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE -# - PG_VERSION=12 PG_BRANCH=REL_12_STABLE PTRACK_PATCH_PG_BRANCH=REL_12_STABLE -# - PG_VERSION=11 PG_BRANCH=REL_11_STABLE PTRACK_PATCH_PG_BRANCH=REL_11_STABLE -# - PG_VERSION=10 PG_BRANCH=REL_10_STABLE -# - PG_VERSION=9.6 PG_BRANCH=REL9_6_STABLE -# - PG_VERSION=9.5 PG_BRANCH=REL9_5_STABLE + - PG_VERSION=15 PG_BRANCH=master PTRACK_PATCH_PG_BRANCH=master + - PG_VERSION=14 PG_BRANCH=REL_14_STABLE PTRACK_PATCH_PG_BRANCH=REL_14_STABLE + - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE + - PG_VERSION=12 PG_BRANCH=REL_12_STABLE PTRACK_PATCH_PG_BRANCH=REL_12_STABLE + - PG_VERSION=11 PG_BRANCH=REL_11_STABLE PTRACK_PATCH_PG_BRANCH=REL_11_STABLE + - PG_VERSION=10 PG_BRANCH=REL_10_STABLE + - PG_VERSION=9.6 PG_BRANCH=REL9_6_STABLE + - PG_VERSION=9.5 PG_BRANCH=REL9_5_STABLE # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=off MODE=archive # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=backup # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=catchup @@ -46,7 +46,7 @@ env: # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=replica # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=off MODE=retention # - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=restore - - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=time_consuming +# - PG_VERSION=13 PG_BRANCH=REL_13_STABLE PTRACK_PATCH_PG_BRANCH=REL_13_STABLE MODE=time_consuming jobs: allow_failures: