1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
|
/*
* multixact_read_v18.c
*
* Functions to read multixact SLRUs from clusters of PostgreSQL version 18
* and older. In version 19, the multixid offsets were expanded from 32 to 64
* bits.
*
* Copyright (c) 2025, PostgreSQL Global Development Group
* src/bin/pg_upgrade/multixact_read_v18.c
*/
#include "postgres_fe.h"
#include "multixact_read_v18.h"
#include "pg_upgrade.h"
/*
* NOTE: below are a bunch of definitions that are copy-pasted from
* multixact.c from version 18. It's important that this file doesn't
* #include the new definitions with the same names from "multixact_internal.h"!
*
* To further avoid confusion in the functions exposed outside this source
* file, we use MultiXactOffset32 to represent the old-style 32-bit multixid
* offsets. The new 64-bit MultiXactOffset should not be used anywhere in
* this file.
*/
#ifdef MULTIXACT_INTERNAL_H
#error multixact_internal.h should not be included in multixact_read_v18.c
#endif
/*
 * Poison the new type name: any use of MultiXactOffset in this file expands
 * to the identifier should_not_be_used instead of the 64-bit type.
 */
#define MultiXactOffset should_not_be_used
/*
 * We need four bytes per old-style 32-bit offset.  Each offsets page is
 * treated as a plain array of MultiXactOffset32 entries.
 */
#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset32))
/* Number of the offsets page that holds the entry for 'multi' */
static inline int64
MultiXactIdToOffsetPage(MultiXactId multi)
{
	int64		pageno = multi / MULTIXACT_OFFSETS_PER_PAGE;

	return pageno;
}
/* Array index, within its offsets page, of the entry for 'multi' */
static inline int
MultiXactIdToOffsetEntry(MultiXactId multi)
{
	int			entryno = multi % MULTIXACT_OFFSETS_PER_PAGE;

	return entryno;
}
/*
* The situation for members is a bit more complex: we store one byte of
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
* groups are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409
* groups per page. This wastes 12 bytes per page, but that's OK -- simplicity
* (and performance) trumps space efficiency here.
*
* Note that the "offset" functions below work with byte offsets, not array
* indexes, so arithmetic must be done using "char *" pointers.
*/
/* We need eight bits per xact, so one xact fits in a byte */
#define MXACT_MEMBER_BITS_PER_XACT 8
#define MXACT_MEMBER_FLAGS_PER_BYTE 1
/* mask that keeps one member's flag bits after shifting: (1 << 8) - 1 */
#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
/* how many full bytes of flags are there in a group? */
#define MULTIXACT_FLAGBYTES_PER_GROUP 4
/* number of members in a group: one per flag byte, i.e. 4 * 1 */
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
/* size in bytes of a complete group (the flag bytes plus the member Xids) */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
/* number of whole groups that fit in a page (integer division) */
#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
/* number of members stored in a page */
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
/* Number of the members page on which the member at 'offset' is stored */
static inline int64
MXOffsetToMemberPage(MultiXactOffset32 offset)
{
	int64		pageno = offset / MULTIXACT_MEMBERS_PER_PAGE;

	return pageno;
}
/*
 * Byte position, within its page, of the flag word of the group that
 * contains the member at 'offset'.  This is also where the group starts.
 */
static inline int
MXOffsetToFlagsOffset(MultiXactOffset32 offset)
{
	MultiXactOffset32 groupno = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
	int			group_in_page = groupno % MULTIXACT_MEMBERGROUPS_PER_PAGE;

	return group_in_page * MULTIXACT_MEMBERGROUP_SIZE;
}
/*
 * Byte position, within its page, of the TransactionId of the member at
 * 'offset': start of the group, skip the flag bytes, then index the Xids.
 */
static inline int
MXOffsetToMemberOffset(MultiXactOffset32 offset)
{
	int			slot = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
	int			group_start = MXOffsetToFlagsOffset(offset);

	return group_start +
		MULTIXACT_FLAGBYTES_PER_GROUP +
		slot * sizeof(TransactionId);
}
/*
 * Number of bits the group's flag word must be shifted right to bring the
 * flags of the member at 'offset' into the low-order bits.
 */
static inline int
MXOffsetToFlagsBitShift(MultiXactOffset32 offset)
{
	int			slot = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;

	return slot * MXACT_MEMBER_BITS_PER_XACT;
}
/*
 * Construct reader of old multixacts.
 *
 * pgdata is the path under which the "pg_multixact/offsets" and
 * "pg_multixact/members" directories are opened.  nextMulti and nextOffset
 * are remembered in the reader; GetOldMultiXactIdSingleMember() uses them as
 * the endpoint when asked about the multixid just before nextMulti.
 *
 * Returns the malloced memory used by all the other calls in this module.
 * Release it with FreeOldMultiXactReader().
 */
OldMultiXactReader *
AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti,
					  MultiXactOffset32 nextOffset)
{
	OldMultiXactReader *state = pg_malloc_object(OldMultiXactReader);
	char		dir[MAXPGPATH] = {0};

	state->nextMXact = nextMulti;
	state->nextOffset = nextOffset;

	/*
	 * pgdata is caller-supplied, so bound every write to the size of the
	 * buffer instead of trusting the path to fit.
	 */
	snprintf(dir, sizeof(dir), "%s/pg_multixact/offsets", pgdata);
	state->offset = AllocSlruRead(dir, false);

	snprintf(dir, sizeof(dir), "%s/pg_multixact/members", pgdata);
	state->members = AllocSlruRead(dir, false);

	return state;
}
/*
 * This is a simplified version of the GetMultiXactIdMembers() server
 * function:
 *
 * - Only return the updating member, if any.  Upgrade only cares about the
 *   updaters.  If there is no updating member, return somewhat arbitrarily
 *   the first locking-only member, because we don't have any way to represent
 *   "no members".
 *
 * - Because there's no concurrent activity, we don't need to worry about
 *   locking and some corner cases.
 *
 * - Don't bail out on invalid entries that could've been left behind after a
 *   server crash.  Such multixids won't appear anywhere else on disk, so the
 *   server will never try to read them.  During upgrade, however, we scan
 *   through all multixids in order, and will encounter such invalid but
 *   unreferenced multixids too.  We try to distinguish between entries that
 *   are invalid because of missed disk writes, like entries with zeros in
 *   offsets or members, and entries that look corrupt in other ways that
 *   should not happen even on a server crash.
 *
 * state is a reader built by AllocOldMultiXactRead(), multi is the multixid
 * to look up, and *member receives the chosen XID and its status.
 *
 * Returns true on success, false if the multixact was invalid; *member is
 * written only when true is returned.  A negative member count or a second
 * updating member is reported with pg_fatal().
 */
bool
GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi,
							  MultiXactMember *member)
{
	MultiXactId nextMXact = state->nextMXact;

	/* This is a members offset, so use the old-style 32-bit offset type. */
	MultiXactOffset32 nextOffset = state->nextOffset;
	MultiXactId tmpMXact;
	int64		pageno;
	int64		prev_pageno;
	int			entryno;
	int			length;
	char	   *buf;
	MultiXactOffset32 offset;
	MultiXactOffset32 nextMXOffset;
	TransactionId result_xid = InvalidTransactionId;
	MultiXactStatus result_status = 0;

	/*
	 * Comment copied from GetMultiXactIdMembers in PostgreSQL v18
	 * multixact.c:
	 *
	 * Find out the offset at which we need to start reading MultiXactMembers
	 * and the number of members in the multixact.  We determine the latter as
	 * the difference between this multixact's starting offset and the next
	 * one's.  However, there are some corner cases to worry about:
	 *
	 * 1. This multixact may be the latest one created, in which case there is
	 * no next one to look at.  The next multixact's offset should be set
	 * already, as we set it in RecordNewMultiXact(), but we used to not do
	 * that in older minor versions.  To cope with that case, if this
	 * multixact is the latest one created, use the nextOffset value we read
	 * above as the endpoint.
	 *
	 * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
	 * for to mean "unset", there is an ambiguity near the point of offset
	 * wraparound.  If we see next multixact's offset is one, is that our
	 * multixact's actual endpoint, or did it end at zero with a subsequent
	 * increment?  We handle this using the knowledge that if the zero'th
	 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
	 * transaction ID so it can't be a multixact member.  Therefore, if we
	 * read a zero from the members array, just ignore it.
	 */
	pageno = MultiXactIdToOffsetPage(multi);
	entryno = MultiXactIdToOffsetEntry(multi);

	/* The offsets page is an array of 32-bit offsets; pick out our entry. */
	buf = SlruReadSwitchPage(state->offset, pageno);
	offset = ((MultiXactOffset32 *) buf)[entryno];

	if (offset == 0)
	{
		/* Invalid entry.  These can be left behind on a server crash. */
		return false;
	}

	/*
	 * Use the same increment rule as GetNewMultiXactId(), that is, don't
	 * handle wraparound explicitly until needed.
	 */
	tmpMXact = multi + 1;

	if (nextMXact == tmpMXact)
	{
		/* Corner case 1: there is no next multixact */
		nextMXOffset = nextOffset;
	}
	else
	{
		/* handle wraparound if needed */
		if (tmpMXact < FirstMultiXactId)
			tmpMXact = FirstMultiXactId;

		prev_pageno = pageno;
		pageno = MultiXactIdToOffsetPage(tmpMXact);
		entryno = MultiXactIdToOffsetEntry(tmpMXact);

		/* Only switch pages if the next entry lives on a different one. */
		if (pageno != prev_pageno)
			buf = SlruReadSwitchPage(state->offset, pageno);

		nextMXOffset = ((MultiXactOffset32 *) buf)[entryno];
	}

	if (nextMXOffset == 0)
	{
		/* Invalid entry.  These can be left behind on a server crash. */
		return false;
	}

	/* The subtraction is done in the 32-bit offset type, then made signed. */
	length = nextMXOffset - offset;

	if (length < 0)
	{
		/*
		 * This entry is corrupt.  We should not see these even after a server
		 * crash.
		 */
		pg_fatal("multixact %u has an invalid length (%d)", multi, length);
	}

	if (length == 0)
	{
		/*
		 * Invalid entry.  The server never writes multixids with zero
		 * members, but it's not clear if a server crash or using pg_resetwal
		 * could leave them behind.  Seems best to accept them.
		 */
		return false;
	}

	/* read the members */
	prev_pageno = -1;
	for (int i = 0; i < length; i++, offset++)
	{
		TransactionId *xactptr;
		uint32	   *flagsptr;
		int			flagsoff;
		int			bshift;
		int			memberoff;
		MultiXactStatus status;

		pageno = MXOffsetToMemberPage(offset);
		memberoff = MXOffsetToMemberOffset(offset);

		/* Fetch a members page only when we cross onto a new one. */
		if (pageno != prev_pageno)
		{
			buf = SlruReadSwitchPage(state->members, pageno);
			prev_pageno = pageno;
		}

		xactptr = (TransactionId *) (buf + memberoff);
		if (!TransactionIdIsValid(*xactptr))
		{
			/*
			 * Corner case 2: offset must have wrapped around to unused slot
			 * zero.
			 */
			if (offset == 0)
				continue;

			/*
			 * Otherwise this is an invalid entry that should not be
			 * referenced from anywhere in the heap.  These can be left behind
			 * on a server crash.  We could return 'false' here, but we prefer
			 * to continue reading the members and converting them the best we
			 * can, to preserve evidence in case this is corruption that
			 * should not have happened.
			 */
		}

		/* Extract this member's status bits from the group's flag word. */
		flagsoff = MXOffsetToFlagsOffset(offset);
		bshift = MXOffsetToFlagsBitShift(offset);
		flagsptr = (uint32 *) (buf + flagsoff);

		status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;

		/*
		 * Remember the updating XID among the members, or first locking XID
		 * if no updating XID.
		 */
		if (ISUPDATE_from_mxstatus(status))
		{
			/* sanity check */
			if (ISUPDATE_from_mxstatus(result_status))
			{
				/*
				 * We don't expect to see more than one updating member, even
				 * if the server had crashed.
				 */
				pg_fatal("multixact %u has more than one updating member",
						 multi);
			}
			result_xid = *xactptr;
			result_status = status;
		}
		else if (!TransactionIdIsValid(result_xid))
		{
			/* No valid XID recorded so far; take this locking-only member. */
			result_xid = *xactptr;
			result_status = status;
		}
	}

	member->xid = result_xid;
	member->status = result_status;
	return true;
}
/*
 * Frees the malloced reader.
 *
 * Releases both SLRU readers held in 'state' and then 'state' itself.  Like
 * free(), a NULL 'state' is accepted and ignored, so cleanup paths need not
 * special-case a reader that was never allocated.
 */
void
FreeOldMultiXactReader(OldMultiXactReader *state)
{
	if (state == NULL)
		return;

	FreeSlruRead(state->offset);
	FreeSlruRead(state->members);
	pfree(state);
}
|