PostgreSQL Source Code git master
multixact_read_v18.c
Go to the documentation of this file.
/*
 * multixact_read_v18.c
 *
 * Functions to read multixact SLRUs from clusters of PostgreSQL version 18
 * and older.  In version 19, the multixid offsets were expanded from 32 to 64
 * bits.
 *
 * Copyright (c) 2025, PostgreSQL Global Development Group
 * src/bin/pg_upgrade/multixact_read_v18.c
 */
11
12#include "postgres_fe.h"
13
14#include "multixact_read_v18.h"
15#include "pg_upgrade.h"
16
/*
 * NOTE: below are a bunch of definitions that are copy-pasted from
 * multixact.c from version 18.  It's important that this file doesn't
 * #include the new definitions with the same names from
 * "multixact_internal.h"!
 *
 * To further avoid confusion in the functions exposed outside this source
 * file, we use MultiXactOffset32 to represent the old-style 32-bit multixid
 * offsets.  The new 64-bit MultiXactOffset should not be used anywhere in
 * this file.
 */
#ifdef MULTIXACT_INTERNAL_H
#error multixact_internal.h should not be included in multixact_read_v18.c
#endif

/*
 * Redirect the new type name to an identifier that is not expected to exist,
 * so that accidental use of MultiXactOffset in this file is caught.
 */
#define MultiXactOffset should_not_be_used

/* An offsets page is an array of four-byte (32-bit) offsets. */
#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset32))
34
35static inline int64
37{
38 return multi / MULTIXACT_OFFSETS_PER_PAGE;
39}
40
41static inline int
43{
44 return multi % MULTIXACT_OFFSETS_PER_PAGE;
45}
46
/*
 * The situation for members is a bit more complex: we store one byte of
 * additional flag bits for each TransactionId.  To do this without getting
 * into alignment issues, we store four bytes of flags, and then the
 * corresponding 4 Xids.  Each such 5-word (20-byte) set is called a "group",
 * and groups are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep
 * 409 groups per page.  This wastes 12 bytes per page, but that's OK --
 * simplicity (and performance) trumps space efficiency here.
 *
 * Note that the "offset" macros work with byte offsets, not array indexes, so
 * arithmetic must be done using "char *" pointers.
 */
/* We need eight bits per xact, so one xact fits in a byte */
#define MXACT_MEMBER_BITS_PER_XACT			8
#define MXACT_MEMBER_FLAGS_PER_BYTE			1
#define MXACT_MEMBER_XACT_BITMASK	((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)

/* how many full bytes of flags are there in a group? */
#define MULTIXACT_FLAGBYTES_PER_GROUP		4
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP	\
	(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE	\
	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
74
75/* page in which a member is to be found */
76static inline int64
78{
79 return offset / MULTIXACT_MEMBERS_PER_PAGE;
80}
81
82/* Location (byte offset within page) of flag word for a given member */
83static inline int
85{
87 int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
88 int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
89
90 return byteoff;
91}
92
93/* Location (byte offset within page) of TransactionId of given member */
94static inline int
96{
97 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
98
99 return MXOffsetToFlagsOffset(offset) +
101 member_in_group * sizeof(TransactionId);
102}
103
104static inline int
106{
107 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
108 int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
109
110 return bshift;
111}
112
113/*
114 * Construct reader of old multixacts.
115 *
116 * Returns the malloced memory used by the all other calls in this module.
117 */
119AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti,
120 MultiXactOffset32 nextOffset)
121{
123 char dir[MAXPGPATH] = {0};
124
125 state->nextMXact = nextMulti;
126 state->nextOffset = nextOffset;
127
128 pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata);
129 state->offset = AllocSlruRead(dir, false);
130
131 pg_sprintf(dir, "%s/pg_multixact/members", pgdata);
132 state->members = AllocSlruRead(dir, false);
133
134 return state;
135}
136
137/*
138 * This is a simplified version of the GetMultiXactIdMembers() server
139 * function:
140 *
141 * - Only return the updating member, if any. Upgrade only cares about the
142 * updaters. If there is no updating member, return somewhat arbitrarily
143 * the first locking-only member, because we don't have any way to represent
144 * "no members".
145 *
146 * - Because there's no concurrent activity, we don't need to worry about
147 * locking and some corner cases.
148 *
149 * - Don't bail out on invalid entries that could've been left behind after a
150 * server crash. Such multixids won't appear anywhere else on disk, so the
151 * server will never try to read them. During upgrade, however, we scan
152 * through all multixids in order, and will encounter such invalid but
153 * unreferenced multixids too. We try to distinguish between entries that
154 * are invalid because of missed disk writes, like entries with zeros in
155 * offsets or members, and entries that look corrupt in other ways that
156 * should not happen even on a server crash.
157 *
158 * Returns true on success, false if the multixact was invalid.
159 */
160bool
162 MultiXactMember *member)
163{
164 MultiXactId nextMXact,
165 nextOffset,
166 tmpMXact;
167 int64 pageno,
168 prev_pageno;
169 int entryno,
170 length;
171 char *buf;
172 MultiXactOffset32 *offptr,
173 offset;
174 MultiXactOffset32 nextMXOffset;
176 MultiXactStatus result_status = 0;
177
178 nextMXact = state->nextMXact;
179 nextOffset = state->nextOffset;
180
181 /*
182 * Comment copied from GetMultiXactIdMembers in PostgreSQL v18
183 * multixact.c:
184 *
185 * Find out the offset at which we need to start reading MultiXactMembers
186 * and the number of members in the multixact. We determine the latter as
187 * the difference between this multixact's starting offset and the next
188 * one's. However, there are some corner cases to worry about:
189 *
190 * 1. This multixact may be the latest one created, in which case there is
191 * no next one to look at. The next multixact's offset should be set
192 * already, as we set it in RecordNewMultiXact(), but we used to not do
193 * that in older minor versions. To cope with that case, if this
194 * multixact is the latest one created, use the nextOffset value we read
195 * above as the endpoint.
196 *
197 * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
198 * for to mean "unset", there is an ambiguity near the point of offset
199 * wraparound. If we see next multixact's offset is one, is that our
200 * multixact's actual endpoint, or did it end at zero with a subsequent
201 * increment? We handle this using the knowledge that if the zero'th
202 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
203 * transaction ID so it can't be a multixact member. Therefore, if we
204 * read a zero from the members array, just ignore it.
205 */
206
207 pageno = MultiXactIdToOffsetPage(multi);
208 entryno = MultiXactIdToOffsetEntry(multi);
209
210 buf = SlruReadSwitchPage(state->offset, pageno);
211 offptr = (MultiXactOffset32 *) buf;
212 offptr += entryno;
213 offset = *offptr;
214
215 if (offset == 0)
216 {
217 /* Invalid entry. These can be left behind on a server crash. */
218 return false;
219 }
220
221 /*
222 * Use the same increment rule as GetNewMultiXactId(), that is, don't
223 * handle wraparound explicitly until needed.
224 */
225 tmpMXact = multi + 1;
226
227 if (nextMXact == tmpMXact)
228 {
229 /* Corner case 1: there is no next multixact */
230 nextMXOffset = nextOffset;
231 }
232 else
233 {
234 /* handle wraparound if needed */
235 if (tmpMXact < FirstMultiXactId)
236 tmpMXact = FirstMultiXactId;
237
238 prev_pageno = pageno;
239
240 pageno = MultiXactIdToOffsetPage(tmpMXact);
241 entryno = MultiXactIdToOffsetEntry(tmpMXact);
242
243 if (pageno != prev_pageno)
244 buf = SlruReadSwitchPage(state->offset, pageno);
245
246 offptr = (MultiXactOffset32 *) buf;
247 offptr += entryno;
248 nextMXOffset = *offptr;
249 }
250
251 if (nextMXOffset == 0)
252 {
253 /* Invalid entry. These can be left behind on a server crash. */
254 return false;
255 }
256 length = nextMXOffset - offset;
257
258 if (length < 0)
259 {
260 /*
261 * This entry is corrupt. We should not see these even after a server
262 * crash.
263 */
264 pg_fatal("multixact %u has an invalid length (%d)", multi, length);
265 }
266 if (length == 0)
267 {
268 /*
269 * Invalid entry. The server never writes multixids with zero
270 * members, but it's not clear if a server crash or using pg_resetwal
271 * could leave them behind. Seems best to accept them.
272 */
273 return false;
274 }
275
276 /* read the members */
277 prev_pageno = -1;
278 for (int i = 0; i < length; i++, offset++)
279 {
280 TransactionId *xactptr;
281 uint32 *flagsptr;
282 int flagsoff;
283 int bshift;
284 int memberoff;
285 MultiXactStatus status;
286
287 pageno = MXOffsetToMemberPage(offset);
288 memberoff = MXOffsetToMemberOffset(offset);
289
290 if (pageno != prev_pageno)
291 {
292 buf = SlruReadSwitchPage(state->members, pageno);
293 prev_pageno = pageno;
294 }
295
296 xactptr = (TransactionId *) (buf + memberoff);
297 if (!TransactionIdIsValid(*xactptr))
298 {
299 /*
300 * Corner case 2: offset must have wrapped around to unused slot
301 * zero.
302 */
303 if (offset == 0)
304 continue;
305
306 /*
307 * Otherwise this is an invalid entry that should not be
308 * referenced from anywhere in the heap. These can be left behind
309 * on a server crash. We could return 'false' here, but we prefer
310 * to continue reading the members and converting them the best we
311 * can, to preserve evidence in case this is corruption that
312 * should not have happened.
313 */
314 }
315
316 flagsoff = MXOffsetToFlagsOffset(offset);
317 bshift = MXOffsetToFlagsBitShift(offset);
318 flagsptr = (uint32 *) (buf + flagsoff);
319
320 status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
321
322 /*
323 * Remember the updating XID among the members, or first locking XID
324 * if no updating XID.
325 */
326 if (ISUPDATE_from_mxstatus(status))
327 {
328 /* sanity check */
329 if (ISUPDATE_from_mxstatus(result_status))
330 {
331 /*
332 * We don't expect to see more than one updating member, even
333 * if the server had crashed.
334 */
335 pg_fatal("multixact %u has more than one updating member",
336 multi);
337 }
338 result_xid = *xactptr;
339 result_status = status;
340 }
341 else if (!TransactionIdIsValid(result_xid))
342 {
343 result_xid = *xactptr;
344 result_status = status;
345 }
346 }
347
348 member->xid = result_xid;
349 member->status = result_status;
350 return true;
351}
352
353/*
354 * Frees the malloced reader.
355 */
356void
358{
359 FreeSlruRead(state->offset);
360 FreeSlruRead(state->members);
361
362 pfree(state);
363}
int64_t int64
Definition: c.h:549
TransactionId MultiXactId
Definition: c.h:681
uint32_t uint32
Definition: c.h:552
uint32 TransactionId
Definition: c.h:671
#define pg_malloc_object(type)
Definition: fe_memutils.h:50
int i
Definition: isn.c:77
void pfree(void *pointer)
Definition: mcxt.c:1594
#define FirstMultiXactId
Definition: multixact.h:26
MultiXactStatus
Definition: multixact.h:37
#define ISUPDATE_from_mxstatus(status)
Definition: multixact.h:51
#define MXACT_MEMBER_BITS_PER_XACT
static int MXOffsetToFlagsBitShift(MultiXactOffset32 offset)
#define MXACT_MEMBER_XACT_BITMASK
#define MULTIXACT_FLAGBYTES_PER_GROUP
static int64 MXOffsetToMemberPage(MultiXactOffset32 offset)
#define MULTIXACT_OFFSETS_PER_PAGE
bool GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi, MultiXactMember *member)
static int MXOffsetToMemberOffset(MultiXactOffset32 offset)
static int MultiXactIdToOffsetEntry(MultiXactId multi)
#define MULTIXACT_MEMBERGROUPS_PER_PAGE
static int64 MultiXactIdToOffsetPage(MultiXactId multi)
OldMultiXactReader * AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti, MultiXactOffset32 nextOffset)
#define MULTIXACT_MEMBERGROUP_SIZE
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP
#define MULTIXACT_MEMBERS_PER_PAGE
void FreeOldMultiXactReader(OldMultiXactReader *state)
static int MXOffsetToFlagsOffset(MultiXactOffset32 offset)
uint32 MultiXactOffset32
#define pg_fatal(...)
#define MAXPGPATH
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
int int int int pg_sprintf(char *str, const char *fmt,...) pg_attribute_printf(2
SlruSegState * AllocSlruRead(const char *dir, bool long_segment_names)
Definition: slru_io.c:62
void FreeSlruRead(SlruSegState *state)
Definition: slru_io.c:153
static char * SlruReadSwitchPage(SlruSegState *state, uint64 pageno)
Definition: slru_io.h:33
TransactionId xid
Definition: multixact.h:57
MultiXactStatus status
Definition: multixact.h:58
Definition: regguts.h:323
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41