Skip to content

Commit 4e9821b

Browse files
committed
Restore replication protocol's duplicate command tags
I removed the duplicate command tags for START_REPLICATION inadvertently in commit 07082b0, but the replication protocol requires them. The fact that the replication protocol was broken was not noticed because all our test cases use an optimized code path that exits early, failing to verify that the behavior is correct for non-optimized cases. Put them back.

Also document this protocol quirk.

Add a test case that shows the failure. It might still succeed even without the patch when run on a fast enough server, but it suffices to show the bug in enough cases that it would be noticed in buildfarm.

Author: Álvaro Herrera <[email protected]>
Reported-by: Henry Hinze <[email protected]>
Reviewed-by: Petr Jelínek <[email protected]>
Discussion: https://postgr.es/m/[email protected]
1 parent b94109c commit 4e9821b

File tree

4 files changed

+61
-6
lines changed

4 files changed

+61
-6
lines changed

doc/src/sgml/protocol.sgml

+5-3
Original file line numberDiff line numberDiff line change
@@ -2059,8 +2059,9 @@ The commands accepted in replication mode are:
20592059
the switch position is the end of the WAL that was streamed, but there
20602060
are corner cases where the server can send some WAL from the old
20612061
timeline that it has not itself replayed before promoting. Finally, the
2062-
server sends CommandComplete message, and is ready to accept a new
2063-
command.
2062+
server sends two CommandComplete messages (one that ends the CopyData
2063+
and the other ends the <literal>START_REPLICATION</literal> itself), and
2064+
is ready to accept a new command.
20642065
</para>
20652066

20662067
<para>
@@ -2382,7 +2383,8 @@ The commands accepted in replication mode are:
23822383

23832384
<para>
23842385
The messages inside the CopyBothResponse messages are of the same format
2385-
documented for <literal>START_REPLICATION ... PHYSICAL</literal>.
2386+
documented for <literal>START_REPLICATION ... PHYSICAL</literal>, including
2387+
two CommandComplete messages.
23862388
</para>
23872389

23882390
<para>

src/backend/replication/logical/worker.c

-1
Original file line numberDiff line numberDiff line change
@@ -3071,7 +3071,6 @@ ApplyWorkerMain(Datum main_arg)
30713071
* does some initializations on the upstream so let's still call it.
30723072
*/
30733073
(void) walrcv_identify_system(wrconn, &startpointTLI);
3074-
30753074
}
30763075

30773076
/*

src/backend/replication/walsender.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -1656,7 +1656,8 @@ exec_replication_command(const char *cmd_string)
16561656
else
16571657
StartLogicalReplication(cmd);
16581658

1659-
/* callees already sent their own completion message */
1659+
/* dupe, but necessary per libpqrcv_endstreaming */
1660+
EndReplicationCommand(cmdtag);
16601661

16611662
Assert(xlogreader != NULL);
16621663
break;

src/test/subscription/t/100_bugs.pl

+54-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use warnings;
44
use PostgresNode;
55
use TestLib;
6-
use Test::More tests => 3;
6+
use Test::More tests => 5;
77

88
# Bug #15114
99

@@ -100,3 +100,56 @@
100100
);
101101

102102
$node_publisher->stop('fast');
103+
104+
# Bug #16643 - https://postgr.es/m/[email protected]
105+
#
106+
# Initial sync doesn't complete; the protocol was not being followed per
107+
# expectations after commit 07082b08cc5d.
108+
my $node_twoways = get_new_node('twoways');
109+
$node_twoways->init(allows_streaming => 'logical');
110+
$node_twoways->start;
111+
for my $db (qw(d1 d2))
112+
{
113+
$node_twoways->safe_psql('postgres', "CREATE DATABASE $db");
114+
$node_twoways->safe_psql($db, "CREATE TABLE t (f int)");
115+
$node_twoways->safe_psql($db, "CREATE TABLE t2 (f int)");
116+
}
117+
118+
my $rows = 3000;
119+
$node_twoways->safe_psql(
120+
'd1', qq{
121+
INSERT INTO t SELECT * FROM generate_series(1, $rows);
122+
INSERT INTO t2 SELECT * FROM generate_series(1, $rows);
123+
CREATE PUBLICATION testpub FOR TABLE t;
124+
SELECT pg_create_logical_replication_slot('testslot', 'pgoutput');
125+
});
126+
127+
$node_twoways->safe_psql('d2',
128+
"CREATE SUBSCRIPTION testsub CONNECTION \$\$"
129+
. $node_twoways->connstr('d1')
130+
. "\$\$ PUBLICATION testpub WITH (create_slot=false, "
131+
. "slot_name='testslot')");
132+
$node_twoways->safe_psql(
133+
'd1', qq{
134+
INSERT INTO t SELECT * FROM generate_series(1, $rows);
135+
INSERT INTO t2 SELECT * FROM generate_series(1, $rows);
136+
});
137+
$node_twoways->safe_psql(
138+
'd1', 'ALTER PUBLICATION testpub ADD TABLE t2');
139+
$node_twoways->safe_psql(
140+
'd2', 'ALTER SUBSCRIPTION testsub REFRESH PUBLICATION');
141+
142+
# We cannot rely solely on wait_for_catchup() here; it isn't sufficient
143+
# when tablesync workers might still be running. So in addition to that,
144+
# we verify that no tablesync workers appear for the subscription.
145+
# XXX maybe this should be integrated in wait_for_catchup() itself.
146+
$node_twoways->wait_for_catchup('testsub');
147+
$node_twoways->poll_query_until(
148+
'd2',
149+
"SELECT count(*) FROM pg_stat_subscription WHERE subname = 'testsub' AND relid <> 0",
150+
"0");
151+
152+
is($node_twoways->safe_psql('d2', "SELECT count(f) FROM t"),
153+
$rows * 2, "2x$rows rows in t");
154+
is($node_twoways->safe_psql('d2', "SELECT count(f) FROM t2"),
155+
$rows * 2, "2x$rows rows in t2");

0 commit comments

Comments
 (0)