diff --git a/doc/src/sgml/ref/pg_resetwal.sgml b/doc/src/sgml/ref/pg_resetwal.sgml index 2c019c2aac6e..41f2b1d480c5 100644 --- a/doc/src/sgml/ref/pg_resetwal.sgml +++ b/doc/src/sgml/ref/pg_resetwal.sgml @@ -267,14 +267,17 @@ PostgreSQL documentation A safe value for the next multitransaction ID (first part) can be determined by looking for the numerically largest file name in the directory pg_multixact/offsets under the data directory, - adding one, and then multiplying by 65536 (0x10000). Conversely, a safe + adding one, and then multiplying by 32768 (0x8000). Conversely, a safe value for the oldest multitransaction ID (second part of ) can be determined by looking for the numerically smallest - file name in the same directory and multiplying by 65536. The file - names are in hexadecimal, so the easiest way to do this is to specify - the option value in hexadecimal and append four zeroes. + file name in the same directory and multiplying by 32768 (0x8000). + Note that the file names are in hexadecimal. It is usually easiest + to specify the option value in hexadecimal too. For example, if + 000F and 0007 are the greatest and + smallest entries in pg_multixact/offsets, + -m 0x80000,0x38000 will work. 
- + diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index 3ca0582db364..052dd0a4ce56 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -65,7 +65,7 @@ multixact_desc(StringInfo buf, XLogReaderState *record) xl_multixact_create *xlrec = (xl_multixact_create *) rec; int i; - appendStringInfo(buf, "%u offset %u nmembers %d: ", xlrec->mid, + appendStringInfo(buf, "%u offset %" PRIu64 " nmembers %d: ", xlrec->mid, xlrec->moff, xlrec->nmembers); for (i = 0; i < xlrec->nmembers; i++) out_member(buf, &xlrec->members[i]); @@ -74,7 +74,7 @@ multixact_desc(StringInfo buf, XLogReaderState *record) { xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec; - appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)", + appendStringInfo(buf, "offsets [%u, %u), members [%" PRIu64 ", %" PRIu64 ")", xlrec->startTruncOff, xlrec->endTruncOff, xlrec->startTruncMemb, xlrec->endTruncMemb); } diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index cd6c2a2f650a..441034f5929c 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -66,7 +66,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%08X; " - "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; " + "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %" PRIu64 "; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 8ed3fd9d071c..dc9c4257a98e 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -69,6 +69,7 @@ #include "postgres.h" #include "access/multixact.h" +#include 
"access/multixact_internal.h" #include "access/slru.h" #include "access/twophase.h" #include "access/twophase_rmgr.h" @@ -89,128 +90,13 @@ /* - * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is - * used everywhere else in Postgres. - * - * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, - * MultiXact page numbering also wraps around at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need - * take no explicit notice of that fact in this module, except when comparing - * segment and page numbers in TruncateMultiXact (see - * MultiXactOffsetPagePrecedes). + * Thresholds used to keep members disk usage in check when multixids have a + * lot of members. When MULTIXACT_MEMBER_LOW_THRESHOLD is reached, vacuum + * starts freezing multixids more aggressively, even if the normal multixid + * age limits haven't been reached yet. */ - -/* We need four bytes per offset */ -#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) - -static inline int64 -MultiXactIdToOffsetPage(MultiXactId multi) -{ - return multi / MULTIXACT_OFFSETS_PER_PAGE; -} - -static inline int -MultiXactIdToOffsetEntry(MultiXactId multi) -{ - return multi % MULTIXACT_OFFSETS_PER_PAGE; -} - -static inline int64 -MultiXactIdToOffsetSegment(MultiXactId multi) -{ - return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT; -} - -/* - * The situation for members is a bit more complex: we store one byte of - * additional flag bits for each TransactionId. To do this without getting - * into alignment issues, we store four bytes of flags, and then the - * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and - * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups - * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and - * performance) trumps space efficiency here. 
- * - * Note that the "offset" macros work with byte offset, not array indexes, so - * arithmetic must be done using "char *" pointers. - */ -/* We need eight bits per xact, so one xact fits in a byte */ -#define MXACT_MEMBER_BITS_PER_XACT 8 -#define MXACT_MEMBER_FLAGS_PER_BYTE 1 -#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) - -/* how many full bytes of flags are there in a group? */ -#define MULTIXACT_FLAGBYTES_PER_GROUP 4 -#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ - (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) -/* size in bytes of a complete group */ -#define MULTIXACT_MEMBERGROUP_SIZE \ - (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) -#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) -#define MULTIXACT_MEMBERS_PER_PAGE \ - (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) - -/* - * Because the number of items per page is not a divisor of the last item - * number (member 0xFFFFFFFF), the last segment does not use the maximum number - * of pages, and moreover the last used page therein does not use the same - * number of items as previous pages. (Another way to say it is that the - * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page - * has some empty space after that item.) - * - * This constant is the number of members in the last page of the last segment. 
- */ -#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ - ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) - -/* page in which a member is to be found */ -static inline int64 -MXOffsetToMemberPage(MultiXactOffset offset) -{ - return offset / MULTIXACT_MEMBERS_PER_PAGE; -} - -static inline int64 -MXOffsetToMemberSegment(MultiXactOffset offset) -{ - return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; -} - -/* Location (byte offset within page) of flag word for a given member */ -static inline int -MXOffsetToFlagsOffset(MultiXactOffset offset) -{ - MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; - int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; - int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; - - return byteoff; -} - -static inline int -MXOffsetToFlagsBitShift(MultiXactOffset offset) -{ - int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; - int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; - - return bshift; -} - -/* Location (byte offset within page) of TransactionId of given member */ -static inline int -MXOffsetToMemberOffset(MultiXactOffset offset) -{ - int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; - - return MXOffsetToFlagsOffset(offset) + - MULTIXACT_FLAGBYTES_PER_GROUP + - member_in_group * sizeof(TransactionId); -} - -/* Multixact members wraparound thresholds. */ -#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) -#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ - (MaxMultiXactOffset - MaxMultiXactOffset / 4) +#define MULTIXACT_MEMBER_LOW_THRESHOLD UINT64CONST(2000000000) +#define MULTIXACT_MEMBER_HIGH_THRESHOLD UINT64CONST(4000000000) static inline MultiXactId PreviousMultiXactId(MultiXactId multi) @@ -255,11 +141,9 @@ typedef struct MultiXactStateData /* * Oldest multixact offset that is potentially referenced by a multixact - * referenced by a relation. We don't always know this value, so there's - * a flag here to indicate whether or not we currently do. 
+ * referenced by a relation. */ MultiXactOffset oldestOffset; - bool oldestOffsetKnown; /* support for anti-wraparound measures */ MultiXactId multiVacLimit; @@ -267,9 +151,6 @@ typedef struct MultiXactStateData MultiXactId multiStopLimit; MultiXactId multiWrapLimit; - /* support for members anti-wraparound measures */ - MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. Each is indexed by @@ -390,13 +271,9 @@ static void mXactCachePut(MultiXactId multi, int nmembers, /* management of SLRU infrastructure */ static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2); static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2); -static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, - MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, - MultiXactOffset start, uint32 distance); -static bool SetOffsetVacuumLimit(bool is_startup); +static void SetOffsetVacuumLimit(void); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, @@ -1191,90 +1068,22 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) ExtendMultiXactOffset(result + 1); /* - * Reserve the members space, similarly to above. Also, be careful not to - * return zero as the starting offset for any multixact. See - * GetMultiXactIdMembers() for motivation. + * Reserve the members space, similarly to above. 
*/ nextOffset = MultiXactState->nextOffset; - if (nextOffset == 0) - { - *offset = 1; - nmembers++; /* allocate member slot 0 too */ - } - else - *offset = nextOffset; - - /*---------- - * Protect against overrun of the members space as well, with the - * following rules: - * - * If we're past offsetStopLimit, refuse to generate more multis. - * If we're close to offsetStopLimit, emit a warning. - * - * Arbitrarily, we start emitting warnings when we're 20 segments or less - * from offsetStopLimit. - * - * Note we haven't updated the shared state yet, so if we fail at this - * point, the multixact ID we grabbed can still be used by the next guy. - * - * Note that there is no point in forcing autovacuum runs here: the - * multixact freeze settings would have to be reduced for that to have any - * effect. - *---------- - */ -#define OFFSET_WARN_SEGMENTS 20 - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, - nmembers)) - { - /* see comment in the corresponding offsets wraparound case */ - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("multixact \"members\" limit exceeded"), - errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", - "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", - MultiXactState->offsetStopLimit - nextOffset - 1, - nmembers, - MultiXactState->offsetStopLimit - nextOffset - 1), - errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.", - MultiXactState->oldestMultiXactDB))); - } /* - * Check whether we should kick autovacuum into action, to prevent members - * wraparound. NB we use a much larger window to trigger autovacuum than - * just the warning limit. 
The warning is just a measure of last resort - - * this is in line with GetNewTransactionId's behaviour. + * Offsets are 64-bit integers and will never wrap around. Firstly, it + * would take an unrealistic amount of time and resources to consume 2^64 + * offsets. Secondly, multixid creation is WAL-logged, so you would run + * out of LSNs before reaching offset wraparound. Nevertheless, check for + * wraparound as a sanity check. */ - if (!MultiXactState->oldestOffsetKnown || - (MultiXactState->nextOffset - MultiXactState->oldestOffset - > MULTIXACT_MEMBER_SAFE_THRESHOLD)) - { - /* - * To avoid swamping the postmaster with signals, we issue the autovac - * request only when crossing a segment boundary. With default - * compilation settings that's roughly after 50k members. This still - * gives plenty of chances before we get into real trouble. - */ - if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != - (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - } - - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, - nextOffset, - nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) - ereport(WARNING, + if (nextOffset + nmembers < nextOffset) + ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", - "database with OID %u must be vacuumed before %d more multixact members are used", - MultiXactState->offsetStopLimit - nextOffset + nmembers, - MultiXactState->oldestMultiXactDB, - MultiXactState->offsetStopLimit - nextOffset + nmembers), - errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings."))); + errmsg("MultiXact members would wrap around"))); + *offset = nextOffset; 
ExtendMultiXactMember(nextOffset, nmembers); @@ -1295,8 +1104,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) * the next iteration. But note that nextMXact may be InvalidMultiXactId * or the first value on a segment-beginning page after this routine * exits, so anyone else looking at the variable must be prepared to deal - * with either case. Similarly, nextOffset may be zero, but we won't use - * that as the actual start offset of the next multixact. + * with either case. */ (MultiXactState->nextMXact)++; @@ -1304,7 +1112,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) LWLockRelease(MultiXactGenLock); - debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); + debug_elog4(DEBUG2, "GetNew: returning %u offset %" PRIu64, + result, *offset); return result; } @@ -1345,8 +1154,8 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, int slotno; MultiXactOffset *offptr; MultiXactOffset offset; + MultiXactOffset nextMXOffset; int length; - int truelength; MultiXactId oldestMXact; MultiXactId nextMXact; MultiXactMember *ptr; @@ -1422,16 +1231,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * Find out the offset at which we need to start reading MultiXactMembers * and the number of members in the multixact. We determine the latter as * the difference between this multixact's starting offset and the next - * one's. However, there is one corner case to worry about: - * - * Because GetNewMultiXactId skips over offset zero, to reserve zero for - * to mean "unset", there is an ambiguity near the point of offset - * wraparound. If we see next multixact's offset is one, is that our - * multixact's actual endpoint, or did it end at zero with a subsequent - * increment? We handle this using the knowledge that if the zero'th - * member slot wasn't filled, it'll contain zero, and zero isn't a valid - * transaction ID so it can't be a multixact member. 
Therefore, if we - * read a zero from the members array, just ignore it. + * one's. */ pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); @@ -1446,12 +1246,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, offptr += entryno; offset = *offptr; - Assert(offset != 0); + if (offset == 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has invalid offset", multi))); /* read next multi's offset */ { MultiXactId tmpMXact; - MultiXactOffset nextMXOffset; /* handle wraparound if needed */ tmpMXact = multi + 1; @@ -1485,23 +1287,30 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; nextMXOffset = *offptr; - - if (nextMXOffset == 0) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("MultiXact %u has invalid next offset", - multi))); - - length = nextMXOffset - offset; } LWLockRelease(lock); lock = NULL; + /* Sanity check the next offset */ + if (nextMXOffset == 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has invalid next offset", multi))); + if (nextMXOffset < offset) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has offset (%" PRIu64") greater than its next offset (%" PRIu64")", + multi, offset, nextMXOffset))); + if (nextMXOffset - offset > INT32_MAX) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has too many members (%" PRIu64 ")", + multi, nextMXOffset - offset))); + length = nextMXOffset - offset; + /* read the members */ ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); - - truelength = 0; prev_pageno = -1; for (int i = 0; i < length; i++, offset++) { @@ -1538,37 +1347,27 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, xactptr = (TransactionId *) (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - - if 
(!TransactionIdIsValid(*xactptr)) - { - /* Corner case: we must be looking at unused slot zero */ - Assert(offset == 0); - continue; - } + Assert(TransactionIdIsValid(*xactptr)); flagsoff = MXOffsetToFlagsOffset(offset); bshift = MXOffsetToFlagsBitShift(offset); flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); - ptr[truelength].xid = *xactptr; - ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; - truelength++; + ptr[i].xid = *xactptr; + ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; } LWLockRelease(lock); - /* A multixid with zero members should not happen */ - Assert(truelength > 0); - /* * Copy the result into the local cache. */ - mXactCachePut(multi, truelength, ptr); + mXactCachePut(multi, length, ptr); debug_elog3(DEBUG2, "GetMembers: no cache for %s", - mxid_to_string(multi, truelength, ptr)); + mxid_to_string(multi, length, ptr)); *members = ptr; - return truelength; + return length; } /* @@ -1975,7 +1774,7 @@ MultiXactShmemInit(void) "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER, LWTRANCHE_MULTIXACTMEMBER_SLRU, SYNC_HANDLER_MULTIXACT_MEMBER, - false); + true); /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ /* Initialize our shared state struct */ @@ -2030,48 +1829,6 @@ BootStrapMultiXact(void) SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0); } -/* - * MaybeExtendOffsetSlru - * Extend the offsets SLRU area, if necessary - * - * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might - * contain files that are shorter than necessary; this would occur if the old - * installation had used multixacts beyond the first page (files cannot be - * copied, because the on-disk representation is different). pg_upgrade would - * update pg_control to set the next offset value to be at that position, so - * that tuples marked as locked by such MultiXacts would be seen as visible - * without having to consult multixact. 
However, trying to create and use a - * new MultiXactId would result in an error because the page on which the new - * value would reside does not exist. This routine is in charge of creating - * such pages. - */ -static void -MaybeExtendOffsetSlru(void) -{ - int64 pageno; - LWLock *lock; - - pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) - { - int slotno; - - /* - * Fortunately for us, SimpleLruWritePage is already prepared to deal - * with creating a new segment file even if the page we're writing is - * not the first in it, so this is enough. - */ - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - } - - LWLockRelease(lock); -} - /* * This must be called ONCE during postmaster or standalone-backend startup. * @@ -2210,8 +1967,8 @@ TrimMultiXact(void) MultiXactState->finishedStartup = true; LWLockRelease(MultiXactGenLock); - /* Now compute how far away the next members wraparound is. */ - SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); + /* Now compute how far away the next multixid wraparound is. 
*/ + SetMultiXactIdLimit(oldestMXact, oldestMXactDB); } /* @@ -2232,7 +1989,7 @@ MultiXactGetCheckptMulti(bool is_shutdown, LWLockRelease(MultiXactGenLock); debug_elog6(DEBUG2, - "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", + "MultiXact: checkpoint is nextMulti %u, nextOffset %" PRIu64 ", oldestMulti %u in DB %u", *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); } @@ -2267,26 +2024,12 @@ void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset) { - debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %" PRIu64, nextMulti, nextMultiOffset); LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->nextMXact = nextMulti; MultiXactState->nextOffset = nextMultiOffset; LWLockRelease(MultiXactGenLock); - - /* - * During a binary upgrade, make sure that the offsets SLRU is large - * enough to contain the next value that would be created. - * - * We need to do this pretty early during the first startup in binary - * upgrade mode: before StartupMultiXact() in fact, because this routine - * is called even before that by StartupXLOG(). And we can't do it - * earlier than at this point, because during that first call of this - * routine we determine the MultiXactState->nextMXact value that - * MaybeExtendOffsetSlru needs. - */ - if (IsBinaryUpgrade) - MaybeExtendOffsetSlru(); } /* @@ -2294,28 +2037,24 @@ MultiXactSetNextMXact(MultiXactId nextMulti, * datminmxid (ie, the oldest MultiXactId that might exist in any database * of our cluster), and the OID of the (or a) database with that value. * - * is_startup is true when we are just starting the cluster, false when we - * are updating state in a running cluster. This only affects log messages. + * This also updates MultiXactState->oldestOffset, by looking up the offset of + * MultiXactState->oldestMultiXactId. 
*/ void -SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, - bool is_startup) +SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) { MultiXactId multiVacLimit; MultiXactId multiWarnLimit; MultiXactId multiStopLimit; MultiXactId multiWrapLimit; MultiXactId curMulti; - bool needs_offset_vacuum; Assert(MultiXactIdIsValid(oldest_datminmxid)); /* * We pretend that a wrap will happen halfway through the multixact ID * space, but that's not really true, because multixacts wrap differently - * from transaction IDs. Note that, separately from any concern about - * multixact IDs wrapping, we must ensure that multixact members do not - * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. + * from transaction IDs. */ multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); if (multiWrapLimit < FirstMultiXactId) @@ -2383,8 +2122,13 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, Assert(!InRecovery); - /* Set limits for offset vacuum. */ - needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); + /* + * Offsets are 64 bits wide and never wrap around, so we don't need to + * consider them for emergency autovacuum purposes. But now that we're in + * a consistent state, determine MultiXactState->oldestOffset, to be used + * to calculate freezing cutoffs that keep members disk usage in check. + */ + SetOffsetVacuumLimit(); /* * If past the autovacuum force point, immediately signal an autovac @@ -2393,8 +2137,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, * database, it'll call here, and we'll signal the postmaster to start * another iteration immediately if there are still any old databases.
*/ - if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || - needs_offset_vacuum) && IsUnderPostmaster) + if (MultiXactIdPrecedes(multiVacLimit, curMulti) && IsUnderPostmaster) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); /* Give an immediate warning if past the wrap warn point */ @@ -2456,9 +2199,9 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti, debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); MultiXactState->nextMXact = minMulti; } - if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) + if (MultiXactState->nextOffset < minMultiOffset) { - debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", + debug_elog3(DEBUG2, "MultiXact: setting next offset to %" PRIu64, minMultiOffset); MultiXactState->nextOffset = minMultiOffset; } @@ -2477,7 +2220,7 @@ MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) Assert(InRecovery); if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) - SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); + SetMultiXactIdLimit(oldestMulti, oldestMultiDB); } /* @@ -2560,27 +2303,11 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockRelease(lock); } - /* - * Compute the number of items till end of current page. Careful: if - * addition of unsigned ints wraps around, we're at the last page of - * the last segment; since that page holds a different number of items - * than other pages, we need to do it differently. - */ - if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) - { - /* - * This is the last page of the last segment; we can compute the - * number of items left to allocate in it without modulo - * arithmetic. - */ - difference = MaxMultiXactOffset - offset + 1; - } - else - difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + /* Compute the number of items till end of current page.
*/ + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; /* - * Advance to next page, taking care to properly handle the wraparound - * case. OK if nmembers goes negative. + * Advance to next page. OK if nmembers goes negative. */ nmembers -= difference; offset += difference; @@ -2642,28 +2369,17 @@ GetOldestMultiXactId(void) } /* - * Determine how aggressively we need to vacuum in order to prevent member - * wraparound. - * - * To do so determine what's the oldest member offset and install the limit - * info in MultiXactState, where it can be used to prevent overrun of old data - * in the members SLRU area. - * - * The return value is true if emergency autovacuum is required and false - * otherwise. + * Calculate the oldest member offset and install it in MultiXactState, where + * it can be used to adjust multixid freezing cutoffs. */ -static bool -SetOffsetVacuumLimit(bool is_startup) +static void +SetOffsetVacuumLimit(void) { MultiXactId oldestMultiXactId; MultiXactId nextMXact; MultiXactOffset oldestOffset = 0; /* placate compiler */ - MultiXactOffset prevOldestOffset; MultiXactOffset nextOffset; bool oldestOffsetKnown = false; - bool prevOldestOffsetKnown; - MultiXactOffset offsetStopLimit = 0; - MultiXactOffset prevOffsetStopLimit; /* * NB: Have to prevent concurrent truncation, we might otherwise try to @@ -2676,9 +2392,6 @@ SetOffsetVacuumLimit(bool is_startup) oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; - prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; - prevOldestOffset = MultiXactState->oldestOffset; - prevOffsetStopLimit = MultiXactState->offsetStopLimit; Assert(MultiXactState->finishedStartup); LWLockRelease(MultiXactGenLock); @@ -2701,121 +2414,39 @@ SetOffsetVacuumLimit(bool is_startup) else { /* - * Figure out where the oldest existing multixact's offsets are - * stored. 
Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, - * the supposedly-earliest multixact might not really exist. We are - * careful not to fail in that case. + * Look up the offset at which the oldest existing multixact's members + * are stored. If we cannot find it, be careful not to fail, and + * leave oldestOffset unchanged. oldestOffset is initialized to zero + * at system startup, which prevents truncating members until a proper + * value is calculated. + * + * (We had bugs in early releases of PostgreSQL 9.3.X and 9.4.X where + * the supposedly-earliest multixact might not really exist. Those + * should be long gone by now, so this should not fail, but let's + * still be defensive.) */ oldestOffsetKnown = find_multixact_start(oldestMultiXactId, &oldestOffset); if (oldestOffsetKnown) ereport(DEBUG1, - (errmsg_internal("oldest MultiXactId member is at offset %u", + (errmsg_internal("oldest MultiXactId member is at offset %" PRIu64, oldestOffset))); else ereport(LOG, - (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", + (errmsg("oldest checkpointed MultiXact %u does not exist on disk", oldestMultiXactId))); } LWLockRelease(MultiXactTruncationLock); - /* - * If we can, compute limits (and install them MultiXactState) to prevent - * overrun of old data in the members SLRU area. We can only do so if the - * oldest offset is known though. 
- */ + /* Install the computed value */ if (oldestOffsetKnown) { - /* move back to start of the corresponding segment */ - offsetStopLimit = oldestOffset - (oldestOffset % - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); - - /* always leave one segment before the wraparound point */ - offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - - if (!prevOldestOffsetKnown && !is_startup) - ereport(LOG, - (errmsg("MultiXact member wraparound protections are now enabled"))); - - ereport(DEBUG1, - (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", - offsetStopLimit, oldestMultiXactId))); - } - else if (prevOldestOffsetKnown) - { - /* - * If we failed to get the oldest offset this time, but we have a - * value from a previous pass through this function, use the old - * values rather than automatically forcing an emergency autovacuum - * cycle again. - */ - oldestOffset = prevOldestOffset; - oldestOffsetKnown = true; - offsetStopLimit = prevOffsetStopLimit; + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestOffset = oldestOffset; + LWLockRelease(MultiXactGenLock); } - - /* Install the computed values */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - MultiXactState->oldestOffset = oldestOffset; - MultiXactState->oldestOffsetKnown = oldestOffsetKnown; - MultiXactState->offsetStopLimit = offsetStopLimit; - LWLockRelease(MultiXactGenLock); - - /* - * Do we need an emergency autovacuum? If we're not sure, assume yes. - */ - return !oldestOffsetKnown || - (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); -} - -/* - * Return whether adding "distance" to "start" would move past "boundary". - * - * We use this to determine whether the addition is "wrapping around" the - * boundary point, hence the name. 
The reason we don't want to use the regular - * 2^31-modulo arithmetic here is that we want to be able to use the whole of - * the 2^32-1 space here, allowing for more multixacts than would fit - * otherwise. - */ -static bool -MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, - uint32 distance) -{ - MultiXactOffset finish; - - /* - * Note that offset number 0 is not used (see GetMultiXactIdMembers), so - * if the addition wraps around the UINT_MAX boundary, skip that value. - */ - finish = start + distance; - if (finish < start) - finish++; - - /*----------------------------------------------------------------------- - * When the boundary is numerically greater than the starting point, any - * value numerically between the two is not wrapped: - * - * <----S----B----> - * [---) = F wrapped past B (and UINT_MAX) - * [---) = F not wrapped - * [----] = F wrapped past B - * - * When the boundary is numerically less than the starting point (i.e. the - * UINT_MAX wraparound occurs somewhere in between) then all values in - * between are wrapped: - * - * <----B----S----> - * [---) = F not wrapped past B (but wrapped past UINT_MAX) - * [---) = F wrapped past B (and UINT_MAX) - * [----] = F not wrapped - *----------------------------------------------------------------------- - */ - if (start < boundary) - return finish >= boundary || finish < start; - else - return finish >= boundary && finish < start; } /* @@ -2869,37 +2500,23 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) * members: Number of member entries (nextOffset - oldestOffset) * oldestMultiXactId: Oldest MultiXact ID still in use * oldestOffset: Oldest offset still in use - * - * Returns false if unable to determine, the oldest offset being unknown. 
*/ -bool +void GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset) { MultiXactOffset nextOffset; MultiXactId nextMultiXactId; - bool oldestOffsetKnown; LWLockAcquire(MultiXactGenLock, LW_SHARED); nextOffset = MultiXactState->nextOffset; *oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMultiXactId = MultiXactState->nextMXact; *oldestOffset = MultiXactState->oldestOffset; - oldestOffsetKnown = MultiXactState->oldestOffsetKnown; LWLockRelease(MultiXactGenLock); - if (!oldestOffsetKnown) - { - *members = 0; - *multixacts = 0; - *oldestMultiXactId = InvalidMultiXactId; - *oldestOffset = 0; - return false; - } - *members = nextOffset - *oldestOffset; *multixacts = nextMultiXactId - *oldestMultiXactId; - return true; } /* @@ -2908,26 +2525,27 @@ GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, * vacuum_multixact_freeze_table_age work together to make sure we never have * too many multixacts; we hope that, at least under normal circumstances, * this will also be sufficient to keep us from using too many offsets. - * However, if the average multixact has many members, we might exhaust the - * members space while still using few enough members that these limits fail - * to trigger relminmxid advancement by VACUUM. At that point, we'd have no - * choice but to start failing multixact-creating operations with an error. - * - * To prevent that, if more than a threshold portion of the members space is - * used, we effectively reduce autovacuum_multixact_freeze_max_age and - * to a value just less than the number of multixacts in use. We hope that - * this will quickly trigger autovacuuming on the table or tables with the - * oldest relminmxid, thus allowing datminmxid values to advance and removing - * some members. - * - * As the fraction of the member space currently in use grows, we become - * more aggressive in clamping this value. 
That not only causes autovacuum - * to ramp up, but also makes any manual vacuums the user issues more - * aggressive. This happens because vacuum_get_cutoffs() will clamp the - * freeze table and the minimum freeze age cutoffs based on the effective - * autovacuum_multixact_freeze_max_age this function returns. In the worst - * case, we'll claim the freeze_max_age to zero, and every vacuum of any - * table will freeze every multixact. + * However, if the average multixact has many members, we might accumulate a + * large amount of members, consuming disk space, while still using few enough + * multixids that the multixid limits fail to trigger relminmxid advancement + * by VACUUM. + * + * To prevent that, if the members space usage exceeds a threshold + * (MULTIXACT_MEMBER_LOW_THRESHOLD), we effectively reduce + * autovacuum_multixact_freeze_max_age to a value just less than the number of + * multixacts in use. We hope that this will quickly trigger autovacuuming on + * the table or tables with the oldest relminmxid, thus allowing datminmxid + * values to advance and removing some members. + * + * As the amount of the member space in use grows, we become more aggressive + * in clamping this value. That not only causes autovacuum to ramp up, but + * also makes any manual vacuums the user issues more aggressive. This + * happens because vacuum_get_cutoffs() will clamp the freeze table and the + * minimum freeze age cutoffs based on the effective + * autovacuum_multixact_freeze_max_age this function returns. At the extreme, + * when the members usage reaches MULTIXACT_MEMBER_HIGH_THRESHOLD, we clamp + * freeze_max_age to zero, and every vacuum of any table will freeze every + * multixact. */ int MultiXactMemberFreezeThreshold(void) @@ -2940,21 +2558,27 @@ MultiXactMemberFreezeThreshold(void) MultiXactId oldestMultiXactId; MultiXactOffset oldestOffset; - /* If we can't determine member space utilization, assume the worst. 
*/ - if (!GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset)) - return 0; + /* Read the current offsets and members usage. */ + GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset); /* If member space utilization is low, no special action is required. */ - if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) + if (members <= MULTIXACT_MEMBER_LOW_THRESHOLD) return autovacuum_multixact_freeze_max_age; /* * Compute a target for relminmxid advancement. The number of multixacts * we try to eliminate from the system is based on how far we are past - * MULTIXACT_MEMBER_SAFE_THRESHOLD. - */ - fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / - (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); + * MULTIXACT_MEMBER_LOW_THRESHOLD. + * + * The way this formula works is that when members is exactly at the low + * threshold, fraction = 0.0, and we set freeze_max_age equal to + * mxid_age(oldestMultiXactId). As members grows further, towards the + * high threshold, fraction grows linearly from 0.0 to 1.0, and the result + * shrinks from mxid_age(oldestMultiXactId) to 0. Beyond the high + * threshold, fraction > 1.0 and the result is clamped to 0. + */ + fraction = (double) (members - MULTIXACT_MEMBER_LOW_THRESHOLD) / + (MULTIXACT_MEMBER_HIGH_THRESHOLD - MULTIXACT_MEMBER_LOW_THRESHOLD); victim_multixacts = multixacts * fraction; /* fraction could be > 1.0, but lowest possible freeze age is zero */ @@ -2995,36 +2619,12 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data /* * Delete members segments [oldest, newOldest) - * - * The members SLRU can, in contrast to the offsets one, be filled to almost - * the full range at once. This means SimpleLruTruncate() can't trivially be - * used - instead the to-be-deleted range is computed using the offsets - * SLRU. C.f. TruncateMultiXact(). 
*/ static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) { - const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); - int64 startsegment = MXOffsetToMemberSegment(oldestOffset); - int64 endsegment = MXOffsetToMemberSegment(newOldestOffset); - int64 segment = startsegment; - - /* - * Delete all the segments but the last one. The last segment can still - * contain, possibly partially, valid data. - */ - while (segment != endsegment) - { - elog(DEBUG2, "truncating multixact members segment %" PRIx64, - segment); - SlruDeleteSegment(MultiXactMemberCtl, segment); - - /* move to next segment, handling wraparound correctly */ - if (segment == maxsegment) - segment = 0; - else - segment += 1; - } + SimpleLruTruncate(MultiXactMemberCtl, + MXOffsetToMemberPage(newOldestOffset)); } /* @@ -3168,7 +2768,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) elog(DEBUG1, "performing multixact truncation: " "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " - "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", + "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")", oldestMulti, newOldestMulti, MultiXactIdToOffsetSegment(oldestMulti), MultiXactIdToOffsetSegment(newOldestMulti), @@ -3209,6 +2809,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->oldestMultiXactId = newOldestMulti; MultiXactState->oldestMultiXactDB = newOldestMultiDB; + MultiXactState->oldestOffset = newOldestOffset; LWLockRelease(MultiXactGenLock); /* First truncate members */ @@ -3248,20 +2849,13 @@ MultiXactOffsetPagePrecedes(int64 page1, int64 page2) /* * Decide whether a MultiXactMember page number is "older" for truncation - * purposes. There is no "invalid offset number" so use the numbers verbatim. + * purposes. 
There is no "invalid offset number" and members never wrap + * around, so use the numbers verbatim. */ static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2) { - MultiXactOffset offset1; - MultiXactOffset offset2; - - offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; - offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; - - return (MultiXactOffsetPrecedes(offset1, offset2) && - MultiXactOffsetPrecedes(offset1, - offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); + return page1 < page2; } /* @@ -3293,17 +2887,6 @@ MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) } -/* - * Decide which of two offsets is earlier. - */ -static bool -MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) -{ - int32 diff = (int32) (offset1 - offset2); - - return (diff < 0); -} - /* * Write a TRUNCATE xlog record * @@ -3396,7 +2979,7 @@ multixact_redo(XLogReaderState *record) elog(DEBUG1, "replaying multixact truncation: " "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " - "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", + "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")", xlrec.startTruncOff, xlrec.endTruncOff, MultiXactIdToOffsetSegment(xlrec.startTruncOff), MultiXactIdToOffsetSegment(xlrec.endTruncOff), @@ -3411,7 +2994,7 @@ multixact_redo(XLogReaderState *record) * Advance the horizon values, so they're current at the end of * recovery. 
*/ - SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB); PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 22d0a2e8c3a6..a000b8bd509a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5139,7 +5139,7 @@ BootStrapXLOG(uint32 data_checksum_version) FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; checkPoint.nextMulti = FirstMultiXactId; - checkPoint.nextMultiOffset = 0; + checkPoint.nextMultiOffset = 1; checkPoint.oldestXid = FirstNormalTransactionId; checkPoint.oldestXidDB = Template1DbOid; checkPoint.oldestMulti = FirstMultiXactId; @@ -5155,7 +5155,7 @@ BootStrapXLOG(uint32 data_checksum_version) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); /* Set up the XLOG page header */ @@ -5636,7 +5636,7 @@ StartupXLOG(void) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); SetCommitTsLimit(checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 21b8f179ba0d..51dea342a4d1 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ 
b/src/backend/access/transam/xlogrecovery.c @@ -886,7 +886,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, U64FromFullTransactionId(checkPoint.nextXid), checkPoint.nextOid))); ereport(DEBUG1, - (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64, checkPoint.nextMulti, checkPoint.nextMultiOffset))); ereport(DEBUG1, (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index e785dd55ce56..7780ea6eae36 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1145,8 +1145,8 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams params, /* * Also compute the multixact age for which freezing is urgent. This is - * normally autovacuum_multixact_freeze_max_age, but may be less if we are - * short of multixact member space. + * normally autovacuum_multixact_freeze_max_age, but may be less if + * multixact members are bloated. */ effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); @@ -1971,7 +1971,7 @@ vac_truncate_clog(TransactionId frozenXID, * signaling twice? */ SetTransactionIdLimit(frozenXID, oldestxid_datoid); - SetMultiXactIdLimit(minMulti, minmulti_datoid, false); + SetMultiXactIdLimit(minMulti, minmulti_datoid); LWLockRelease(WrapLimitsVacuumLock); } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 1c38488f2cbb..f4830f896f33 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -1936,8 +1936,8 @@ do_autovacuum(void) /* * Compute the multixact age for which freezing is urgent. This is - * normally autovacuum_multixact_freeze_max_age, but may be less if we are - * short of multixact member space. + * normally autovacuum_multixact_freeze_max_age, but may be less if + * multixact members are bloated. 
*/ effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 30ad46912e18..a4060309ae0e 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -271,7 +271,7 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile->checkPointCopy.nextMulti); - printf(_("Latest checkpoint's NextMultiOffset: %u\n"), + printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"), ControlFile->checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile->checkPointCopy.oldestXid); diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 8d5d9805279a..d5de4a7171a2 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -64,21 +64,43 @@ static ControlFileData ControlFile; /* pg_control values */ static XLogSegNo newXlogSegNo; /* new XLOG segment # */ static bool guessed = false; /* T if we had to guess at any values */ static const char *progname; -static uint32 set_xid_epoch = (uint32) -1; -static TransactionId set_oldest_xid = 0; -static TransactionId set_xid = 0; -static TransactionId set_oldest_commit_ts_xid = 0; -static TransactionId set_newest_commit_ts_xid = 0; -static Oid set_oid = 0; -static bool mxid_given = false; -static MultiXactId set_mxid = 0; -static bool mxoff_given = false; -static MultiXactOffset set_mxoff = 0; + +/* + * New values given on the command-line + */ +static bool next_xid_epoch_given = false; +static uint32 next_xid_epoch_val; + +static bool oldest_xid_given = false; +static TransactionId oldest_xid_val; + +static bool next_xid_given = false; +static TransactionId next_xid_val; + +static bool commit_ts_xids_given = false; +static TransactionId oldest_commit_ts_xid_val; +static TransactionId newest_commit_ts_xid_val; + 
+static bool next_oid_given = false; +static Oid next_oid_val; + +static bool mxids_given = false; +static MultiXactId next_mxid_val; +static MultiXactId oldest_mxid_val = 0; + +static bool next_mxoff_given = false; +static MultiXactOffset next_mxoff_val; + +static bool wal_segsize_given = false; +static int wal_segsize_val; + +static bool char_signedness_given = false; +static bool char_signedness_val; + + static TimeLineID minXlogTli = 0; static XLogSegNo minXlogSegNo = 0; static int WalSegSz; -static int set_wal_segsize; -static int set_char_signedness = -1; static void CheckDataVersion(void); static bool read_controlfile(void); @@ -92,6 +114,8 @@ static void KillExistingArchiveStatus(void); static void KillExistingWALSummaries(void); static void WriteEmptyXLOG(void); static void usage(void); +static uint32 strtouint32_strict(const char *restrict s, char **restrict endptr, int base); +static uint64 strtouint64_strict(const char *restrict s, char **restrict endptr, int base); int @@ -117,10 +141,8 @@ main(int argc, char *argv[]) int c; bool force = false; bool noupdate = false; - MultiXactId set_oldestmxid = 0; char *endptr; char *endptr2; - int64 tmpi64; char *DataDir = NULL; char *log_fname = NULL; int fd; @@ -162,7 +184,7 @@ main(int argc, char *argv[]) case 'e': errno = 0; - set_xid_epoch = strtoul(optarg, &endptr, 0); + next_xid_epoch_val = strtouint32_strict(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { /*------ @@ -171,46 +193,47 @@ main(int argc, char *argv[]) pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } - if (set_xid_epoch == -1) - pg_fatal("transaction ID epoch (-e) must not be -1"); + next_xid_epoch_given = true; break; case 'u': errno = 0; - set_oldest_xid = strtoul(optarg, &endptr, 0); + oldest_xid_val = strtouint32_strict(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-u"); pg_log_error_hint("Try \"%s 
--help\" for more information.", progname); exit(1); } - if (!TransactionIdIsNormal(set_oldest_xid)) + if (!TransactionIdIsNormal(oldest_xid_val)) pg_fatal("oldest transaction ID (-u) must be greater than or equal to %u", FirstNormalTransactionId); + oldest_xid_given = true; break; case 'x': errno = 0; - set_xid = strtoul(optarg, &endptr, 0); + next_xid_val = strtouint32_strict(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-x"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } - if (!TransactionIdIsNormal(set_xid)) + if (!TransactionIdIsNormal(next_xid_val)) pg_fatal("transaction ID (-x) must be greater than or equal to %u", FirstNormalTransactionId); + next_xid_given = true; break; case 'c': errno = 0; - set_oldest_commit_ts_xid = strtoul(optarg, &endptr, 0); + oldest_commit_ts_xid_val = strtouint32_strict(optarg, &endptr, 0); if (endptr == optarg || *endptr != ',' || errno != 0) { pg_log_error("invalid argument for option %s", "-c"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } - set_newest_commit_ts_xid = strtoul(endptr + 1, &endptr2, 0); + newest_commit_ts_xid_val = strtoul(endptr + 1, &endptr2, 0); if (endptr2 == endptr + 1 || *endptr2 != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-c"); @@ -218,31 +241,33 @@ main(int argc, char *argv[]) exit(1); } - if (set_oldest_commit_ts_xid < FirstNormalTransactionId && - set_oldest_commit_ts_xid != InvalidTransactionId) + if (oldest_commit_ts_xid_val < FirstNormalTransactionId && + oldest_commit_ts_xid_val != InvalidTransactionId) pg_fatal("transaction ID (-c) must be either %u or greater than or equal to %u", InvalidTransactionId, FirstNormalTransactionId); - if (set_newest_commit_ts_xid < FirstNormalTransactionId && - set_newest_commit_ts_xid != InvalidTransactionId) + if (newest_commit_ts_xid_val < FirstNormalTransactionId && + 
newest_commit_ts_xid_val != InvalidTransactionId) pg_fatal("transaction ID (-c) must be either %u or greater than or equal to %u", InvalidTransactionId, FirstNormalTransactionId); + commit_ts_xids_given = true; break; case 'o': errno = 0; - set_oid = strtoul(optarg, &endptr, 0); + next_oid_val = strtouint32_strict(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-o"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } - if (set_oid == 0) + if (next_oid_val == 0) pg_fatal("OID (-o) must not be 0"); + next_oid_given = true; break; case 'm': errno = 0; - set_mxid = strtoul(optarg, &endptr, 0); + next_mxid_val = strtouint32_strict(optarg, &endptr, 0); if (endptr == optarg || *endptr != ',' || errno != 0) { pg_log_error("invalid argument for option %s", "-m"); @@ -250,7 +275,7 @@ main(int argc, char *argv[]) exit(1); } - set_oldestmxid = strtoul(endptr + 1, &endptr2, 0); + oldest_mxid_val = strtouint32_strict(endptr + 1, &endptr2, 0); if (endptr2 == endptr + 1 || *endptr2 != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-m"); @@ -262,25 +287,21 @@ main(int argc, char *argv[]) * XXX It'd be nice to have more sanity checks here, e.g. so * that oldest is not wrapped around w.r.t. nextMulti. 
*/ - if (set_oldestmxid == 0) + if (oldest_mxid_val == 0) pg_fatal("oldest multitransaction ID (-m) must not be 0"); - mxid_given = true; + mxids_given = true; break; case 'O': errno = 0; - tmpi64 = strtoi64(optarg, &endptr, 0); + next_mxoff_val = strtouint64_strict(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-O"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } - if (tmpi64 < 0 || tmpi64 > (int64) MaxMultiXactOffset) - pg_fatal("multitransaction offset (-O) must be between 0 and %u", MaxMultiXactOffset); - - set_mxoff = (MultiXactOffset) tmpi64; - mxoff_given = true; + next_mxoff_given = true; break; case 'l': @@ -304,9 +325,10 @@ main(int argc, char *argv[]) if (!option_parse_int(optarg, "--wal-segsize", 1, 1024, &wal_segsize_mb)) exit(1); - set_wal_segsize = wal_segsize_mb * 1024 * 1024; - if (!IsValidWalSegSize(set_wal_segsize)) + wal_segsize_val = wal_segsize_mb * 1024 * 1024; + if (!IsValidWalSegSize(wal_segsize_val)) pg_fatal("argument of %s must be a power of two between 1 and 1024", "--wal-segsize"); + wal_segsize_given = true; break; } @@ -315,15 +337,16 @@ main(int argc, char *argv[]) errno = 0; if (pg_strcasecmp(optarg, "signed") == 0) - set_char_signedness = 1; + char_signedness_val = true; else if (pg_strcasecmp(optarg, "unsigned") == 0) - set_char_signedness = 0; + char_signedness_val = false; else { pg_log_error("invalid argument for option %s", "--char-signedness"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } + char_signedness_given = true; break; } @@ -411,8 +434,8 @@ main(int argc, char *argv[]) /* * If no new WAL segment size was specified, use the control file value. 
*/ - if (set_wal_segsize != 0) - WalSegSz = set_wal_segsize; + if (wal_segsize_given) + WalSegSz = wal_segsize_val; else WalSegSz = ControlFile.xlog_seg_size; @@ -435,42 +458,43 @@ main(int argc, char *argv[]) * Adjust fields if required by switches. (Do this now so that printout, * if any, includes these values.) */ - if (set_xid_epoch != -1) + if (next_xid_epoch_given) ControlFile.checkPointCopy.nextXid = - FullTransactionIdFromEpochAndXid(set_xid_epoch, + FullTransactionIdFromEpochAndXid(next_xid_epoch_val, XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); - if (set_oldest_xid != 0) + if (oldest_xid_given) { - ControlFile.checkPointCopy.oldestXid = set_oldest_xid; + ControlFile.checkPointCopy.oldestXid = oldest_xid_val; ControlFile.checkPointCopy.oldestXidDB = InvalidOid; } - if (set_xid != 0) + if (next_xid_given) ControlFile.checkPointCopy.nextXid = FullTransactionIdFromEpochAndXid(EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid), - set_xid); + next_xid_val); - if (set_oldest_commit_ts_xid != 0) - ControlFile.checkPointCopy.oldestCommitTsXid = set_oldest_commit_ts_xid; - if (set_newest_commit_ts_xid != 0) - ControlFile.checkPointCopy.newestCommitTsXid = set_newest_commit_ts_xid; + if (commit_ts_xids_given) + { + ControlFile.checkPointCopy.oldestCommitTsXid = oldest_commit_ts_xid_val; + ControlFile.checkPointCopy.newestCommitTsXid = newest_commit_ts_xid_val; + } - if (set_oid != 0) - ControlFile.checkPointCopy.nextOid = set_oid; + if (next_oid_given) + ControlFile.checkPointCopy.nextOid = next_oid_val; - if (mxid_given) + if (mxids_given) { - ControlFile.checkPointCopy.nextMulti = set_mxid; + ControlFile.checkPointCopy.nextMulti = next_mxid_val; - ControlFile.checkPointCopy.oldestMulti = set_oldestmxid; + ControlFile.checkPointCopy.oldestMulti = oldest_mxid_val; if (ControlFile.checkPointCopy.oldestMulti < FirstMultiXactId) ControlFile.checkPointCopy.oldestMulti += FirstMultiXactId; ControlFile.checkPointCopy.oldestMultiDB = 
InvalidOid; } - if (mxoff_given) - ControlFile.checkPointCopy.nextMultiOffset = set_mxoff; + if (next_mxoff_given) + ControlFile.checkPointCopy.nextMultiOffset = next_mxoff_val; if (minXlogTli > ControlFile.checkPointCopy.ThisTimeLineID) { @@ -478,11 +502,11 @@ main(int argc, char *argv[]) ControlFile.checkPointCopy.PrevTimeLineID = minXlogTli; } - if (set_wal_segsize != 0) + if (wal_segsize_given) ControlFile.xlog_seg_size = WalSegSz; - if (set_char_signedness != -1) - ControlFile.default_char_signedness = (set_char_signedness == 1); + if (char_signedness_given) + ControlFile.default_char_signedness = char_signedness_val; if (minXlogSegNo > newXlogSegNo) newXlogSegNo = minXlogSegNo; @@ -749,7 +773,7 @@ PrintControlValues(bool guessed) ControlFile.checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile.checkPointCopy.nextMulti); - printf(_("Latest checkpoint's NextMultiOffset: %u\n"), + printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"), ControlFile.checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile.checkPointCopy.oldestXid); @@ -813,7 +837,7 @@ PrintNewControlValues(void) newXlogSegNo, WalSegSz); printf(_("First log segment after reset: %s\n"), fname); - if (mxid_given) + if (mxids_given) { printf(_("NextMultiXactId: %u\n"), ControlFile.checkPointCopy.nextMulti); @@ -823,25 +847,25 @@ PrintNewControlValues(void) ControlFile.checkPointCopy.oldestMultiDB); } - if (mxoff_given) + if (next_mxoff_given) { - printf(_("NextMultiOffset: %u\n"), + printf(_("NextMultiOffset: %" PRIu64 "\n"), ControlFile.checkPointCopy.nextMultiOffset); } - if (set_oid != 0) + if (next_oid_given) { printf(_("NextOID: %u\n"), ControlFile.checkPointCopy.nextOid); } - if (set_xid != 0) + if (next_xid_given) { printf(_("NextXID: %u\n"), XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); } - if (set_oldest_xid != 0) + if (oldest_xid_given) { printf(_("OldestXID: %u\n"), 
ControlFile.checkPointCopy.oldestXid); @@ -849,24 +873,21 @@ PrintNewControlValues(void) ControlFile.checkPointCopy.oldestXidDB); } - if (set_xid_epoch != -1) + if (next_xid_epoch_given) { printf(_("NextXID epoch: %u\n"), EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); } - if (set_oldest_commit_ts_xid != 0) + if (commit_ts_xids_given) { printf(_("oldestCommitTsXid: %u\n"), ControlFile.checkPointCopy.oldestCommitTsXid); - } - if (set_newest_commit_ts_xid != 0) - { printf(_("newestCommitTsXid: %u\n"), ControlFile.checkPointCopy.newestCommitTsXid); } - if (set_wal_segsize != 0) + if (wal_segsize_given) { printf(_("Bytes per WAL segment: %u\n"), ControlFile.xlog_seg_size); @@ -1214,3 +1235,76 @@ usage(void) printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); } + +/* + * strtouint32_strict -- like strtoul(), but returns uint32 and doesn't accept + * negative values + */ +static uint32 +strtouint32_strict(const char *restrict s, char **restrict endptr, int base) +{ + unsigned long val; + bool is_neg; + + /* skip leading whitespace */ + while (isspace(*s)) + s++; + + /* + * Is it negative? We still call strtoul() if it was, to set 'endptr'. + * (The current callers don't care though.) + */ + is_neg = (*s == '-'); + + val = strtoul(s, endptr, base); + + /* reject if it was negative */ + if (errno == 0 && is_neg) + { + errno = ERANGE; + val = 0; + } + + /* + * reject values larger than UINT32_MAX on platforms where long is 64 bits + * wide. + */ + if (errno == 0 && val != (uint32) val) + { + errno = ERANGE; + val = UINT32_MAX; + } + + return (uint32) val; +} + +/* + * strtouint64_strict -- like strtou64(), but doesn't accept negative values + */ +static uint64 +strtouint64_strict(const char *restrict s, char **restrict endptr, int base) +{ + uint64 val; + bool is_neg; + + /* skip leading whitespace */ + while (isspace(*s)) + s++; + + /* + * Is it negative? 
We still call strtou64() if it was, to set 'endptr'. + * (The current callers don't care though.) + */ + is_neg = (*s == '-'); + + val = strtou64(s, endptr, base); + + /* reject if it was negative */ + if (errno == 0 && is_neg) + { + errno = ERANGE; + val = 0; + } + + return val; +} diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl index 90ecb8afe187..4ae51ee574e0 100644 --- a/src/bin/pg_resetwal/t/001_basic.pl +++ b/src/bin/pg_resetwal/t/001_basic.pl @@ -103,7 +103,7 @@ 'fails with incorrect -e option'); command_fails_like( [ 'pg_resetwal', '-e' => '-1', $node->data_dir ], - qr/must not be -1/, + qr/error: invalid argument for option -e/, 'fails with -e value -1'); # -l command_fails_like( @@ -145,7 +145,7 @@ 'fails with incorrect -O option'); command_fails_like( [ 'pg_resetwal', '-O' => '-1', $node->data_dir ], - qr/must be between 0 and 4294967295/, + qr/error: invalid argument for option -O/, 'fails with -O value -1'); # --wal-segsize command_fails_like( @@ -175,6 +175,21 @@ qr/must be greater than/, 'fails with -x value too small'); +# Check out of range values with -x. These are forbidden for all other +# 32-bit values too, but we use just -x to exercise the parsing. +command_fails_like( + [ 'pg_resetwal', '-x' => '-1', $node->data_dir ], + qr/error: invalid argument for option -x/, + 'fails with -x value -1'); +command_fails_like( + [ 'pg_resetwal', '-x' => '-100', $node->data_dir ], + qr/error: invalid argument for option -x/, + 'fails with negative -x value'); +command_fails_like( + [ 'pg_resetwal', '-x' => '10000000000', $node->data_dir ], + qr/error: invalid argument for option -x/, + 'fails with -x value too large'); + # --char-signedness command_fails_like( [ 'pg_resetwal', '--char-signedness', 'foo', $node->data_dir ], @@ -215,7 +230,7 @@ sub get_slru_files sprintf("%d,%d", hex($files[0]) == 0 ? 
3 : hex($files[0]), hex($files[-1])); @files = get_slru_files('pg_multixact/offsets'); -$mult = 32 * $blcksz / 4; +$mult = 32 * $blcksz / 8; # --multixact-ids argument is "new,old" push @cmd, '--multixact-ids' => sprintf("%d,%d", diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index 69fcf593caec..12f747b2c596 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -18,11 +18,14 @@ OBJS = \ file.o \ function.o \ info.o \ + multixact_rewrite.o \ + multixact_read_v18.o \ option.o \ parallel.o \ pg_upgrade.o \ relfilenumber.o \ server.o \ + slru_io.o \ tablespace.o \ task.o \ util.o \ diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build index ac992f0d14b1..7bd7062b62fc 100644 --- a/src/bin/pg_upgrade/meson.build +++ b/src/bin/pg_upgrade/meson.build @@ -8,11 +8,14 @@ pg_upgrade_sources = files( 'file.c', 'function.c', 'info.c', + 'multixact_rewrite.c', + 'multixact_read_v18.c', 'option.c', 'parallel.c', 'pg_upgrade.c', 'relfilenumber.c', 'server.c', + 'slru_io.c', 'tablespace.c', 'task.c', 'util.c', @@ -47,6 +50,7 @@ tests += { 't/004_subscription.pl', 't/005_char_signedness.pl', 't/006_transfer_modes.pl', + 't/007_multixact_conversion.pl', ], 'test_kwargs': {'priority': 40}, # pg_upgrade tests are slow }, diff --git a/src/bin/pg_upgrade/multixact_read_v18.c b/src/bin/pg_upgrade/multixact_read_v18.c new file mode 100644 index 000000000000..fb537668a2c1 --- /dev/null +++ b/src/bin/pg_upgrade/multixact_read_v18.c @@ -0,0 +1,337 @@ +/* + * multixact_read_v18.c + * + * Functions to read multixact SLRUs from cluster of PostgreSQL version 18 and + * older. In version 19, the multixid offsets were expanded from 32 to 64 + * bits. 
+ * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_read_v18.c + */ + +#include "postgres_fe.h" + +#include "multixact_read_v18.h" +#include "pg_upgrade.h" + +/* + * NOTE: below are a bunch of definitions that are copy-pasted from + * multixact.c from version 18. It's important that this file doesn't + * #include the new definitions with same names from "multixact_internal.h"! + * + * To avoid confusion in the functions exposed outside this source file, + * though, we use OldMultiXactOffset to represent the old-style 32-bit + * multixid offsets. The new 64-bit MultiXactOffset should not be used + * anywhere in this file. + */ +#define MultiXactOffset should_not_be_used + +/* We need four bytes per offset and 8 bytes per base for each page. */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(OldMultiXactOffset)) + +static inline int64 +MultiXactIdToOffsetPage(MultiXactId multi) +{ + return multi / MULTIXACT_OFFSETS_PER_PAGE; +} + +static inline int +MultiXactIdToOffsetEntry(MultiXactId multi) +{ + return multi % MULTIXACT_OFFSETS_PER_PAGE; +} + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. 
+ */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +static inline int64 +MXOffsetToMemberPage(OldMultiXactOffset offset) +{ + return offset / MULTIXACT_MEMBERS_PER_PAGE; +} + +/* Location (byte offset within page) of flag word for a given member */ +static inline int +MXOffsetToFlagsOffset(OldMultiXactOffset offset) +{ + OldMultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; + int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; + + return byteoff; +} + +/* Location (byte offset within page) of TransactionId of given member */ +static inline int +MXOffsetToMemberOffset(OldMultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + + return MXOffsetToFlagsOffset(offset) + + MULTIXACT_FLAGBYTES_PER_GROUP + + member_in_group * sizeof(TransactionId); +} + +static inline int +MXOffsetToFlagsBitShift(OldMultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; + + return bshift; +} + +/* + * Construct reader of old multixacts. 
+ * + * Returns the malloced memory used by the all other calls in this module. + */ +OldMultiXactReader * +AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti, + OldMultiXactOffset nextOffset) +{ + OldMultiXactReader *state = state = pg_malloc(sizeof(*state)); + char dir[MAXPGPATH] = {0}; + + state->nextMXact = nextMulti; + state->nextOffset = nextOffset; + + pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata); + state->offset = AllocSlruRead(dir, false); + + pg_sprintf(dir, "%s/pg_multixact/members", pgdata); + state->members = AllocSlruRead(dir, false); + + return state; +} + +/* + * This is a simplified version of the GetMultiXactIdMembers() server + * function: + * + * - Only return the updating member, if any. Upgrade only cares about the + * updaters. If there is no updating member, return somewhat arbitrarily + * the first locking-only member, because we don't have any way to represent + * "no members". + * + * - Because there's no concurrent activity, We don't need to worry about + * locking and some corner cases. + * + * - Don't bail out on invalid entries. If the server crashes, it can leave + * invalid or half-written entries on disk. Such multixids won't appear + * anywhere else on disk, so the server will never try to read them. During + * upgrade, however, we scan through all multixids in order, and will + * encounter such invalid but unreferenced multixids too. + * + * Returns true on success, false if the multixact was invalid. 
/*
 * Look up one representative member of multixid 'multi' in the old cluster.
 *
 * This is a simplified version of the GetMultiXactIdMembers() server
 * function:
 *
 * - Only return the updating member, if any.  Upgrade only cares about the
 *   updaters.  If there is no updating member, return somewhat arbitrarily
 *   the first locking-only member, because we don't have any way to
 *   represent "no members".
 *
 * - Because there's no concurrent activity, we don't need to worry about
 *   locking and some corner cases.
 *
 * - Don't bail out on invalid entries.  If the server crashes, it can leave
 *   invalid or half-written entries on disk.  Such multixids won't appear
 *   anywhere else on disk, so the server will never try to read them.
 *   During upgrade, however, we scan through all multixids in order, and
 *   will encounter such invalid but unreferenced multixids too.
 *
 * On success, the chosen member's xid and status are stored in '*member'.
 *
 * Returns true on success, false if the multixact was invalid (its own
 * offset, or the offset used as its end point, reads as zero).
 */
bool
GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi,
							  MultiXactMember *member)
{
	/*
	 * NOTE(review): nextOffset is declared as MultiXactId although it holds
	 * the reader's OldMultiXactOffset value; this presumably relies on both
	 * being 32 bits wide -- confirm.
	 */
	MultiXactId nextMXact,
				nextOffset,
				tmpMXact;
	int64		pageno,
				prev_pageno;
	int			entryno,
				length;
	char	   *buf;
	OldMultiXactOffset *offptr,
				offset;
	OldMultiXactOffset nextMXOffset;
	TransactionId result_xid = InvalidTransactionId;
	MultiXactStatus result_status = 0;

	nextMXact = state->nextMXact;
	nextOffset = state->nextOffset;

	/*
	 * Comment copied from GetMultiXactIdMembers in PostgreSQL v18
	 * multixact.c:
	 *
	 * Find out the offset at which we need to start reading MultiXactMembers
	 * and the number of members in the multixact.  We determine the latter
	 * as the difference between this multixact's starting offset and the
	 * next one's.  However, there are some corner cases to worry about:
	 *
	 * 1. This multixact may be the latest one created, in which case there
	 * is no next one to look at.  The next multixact's offset should be set
	 * already, as we set it in RecordNewMultiXact(), but we used to not do
	 * that in older minor versions.  To cope with that case, if this
	 * multixact is the latest one created, use the nextOffset value we read
	 * above as the endpoint.
	 *
	 * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
	 * to mean "unset", there is an ambiguity near the point of offset
	 * wraparound.  If we see next multixact's offset is one, is that our
	 * multixact's actual endpoint, or did it end at zero with a subsequent
	 * increment?  We handle this using the knowledge that if the zero'th
	 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
	 * transaction ID so it can't be a multixact member.  Therefore, if we
	 * read a zero from the members array, just ignore it.
	 */

	/* Fetch this multixid's starting offset from the offsets SLRU */
	pageno = MultiXactIdToOffsetPage(multi);
	entryno = MultiXactIdToOffsetEntry(multi);

	buf = SlruReadSwitchPage(state->offset, pageno);
	offptr = (OldMultiXactOffset *) buf;
	offptr += entryno;
	offset = *offptr;

	if (offset == 0)
	{
		/* Invalid entry */
		return false;
	}

	/*
	 * Use the same increment rule as GetNewMultiXactId(), that is, don't
	 * handle wraparound explicitly until needed.
	 */
	tmpMXact = multi + 1;

	if (nextMXact == tmpMXact)
	{
		/* Corner case 1: there is no next multixact */
		nextMXOffset = nextOffset;
	}
	else
	{
		/* handle wraparound if needed */
		if (tmpMXact < FirstMultiXactId)
			tmpMXact = FirstMultiXactId;

		prev_pageno = pageno;

		pageno = MultiXactIdToOffsetPage(tmpMXact);
		entryno = MultiXactIdToOffsetEntry(tmpMXact);

		/* 'buf' still holds the right page unless the next entry moved on */
		if (pageno != prev_pageno)
			buf = SlruReadSwitchPage(state->offset, pageno);

		offptr = (OldMultiXactOffset *) buf;
		offptr += entryno;
		nextMXOffset = *offptr;
	}

	if (nextMXOffset == 0)
	{
		/* Invalid entry */
		return false;
	}

	/*
	 * Both operands are unsigned 32-bit OldMultiXactOffsets, so the
	 * difference is computed modulo 2^32 before being stored in 'length'.
	 */
	length = nextMXOffset - offset;

	/* read the members */
	prev_pageno = -1;
	for (int i = 0; i < length; i++, offset++)
	{
		TransactionId *xactptr;
		uint32	   *flagsptr;
		int			flagsoff;
		int			bshift;
		int			memberoff;
		MultiXactStatus status;

		pageno = MXOffsetToMemberPage(offset);
		memberoff = MXOffsetToMemberOffset(offset);

		/* From here on 'buf' refers to the members reader's page buffer */
		if (pageno != prev_pageno)
		{
			buf = SlruReadSwitchPage(state->members, pageno);
			prev_pageno = pageno;
		}

		xactptr = (TransactionId *) (buf + memberoff);
		if (!TransactionIdIsValid(*xactptr))
		{
			/*
			 * Corner case 2: we are looking at unused slot zero
			 */
			if (offset == 0)
				continue;

			/*
			 * Otherwise this is an invalid entry that should not be
			 * referenced from anywhere in the heap.  We could return 'false'
			 * here, but we prefer to continue reading the members and
			 * converting them the best we can, to preserve evidence in case
			 * this is corruption that should not happen.
			 */
		}

		/* Extract this member's status bits from its group's flag word */
		flagsoff = MXOffsetToFlagsOffset(offset);
		bshift = MXOffsetToFlagsBitShift(offset);
		flagsptr = (uint32 *) (buf + flagsoff);

		status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;

		/*
		 * Remember the updating XID among the members, or first locking XID
		 * if no updating XID.
		 */
		if (ISUPDATE_from_mxstatus(status))
		{
			/* sanity check */
			if (ISUPDATE_from_mxstatus(result_status))
			{
				/*
				 * We don't expect to see more than one updating member, even
				 * if the server had crashed.
				 */
				pg_fatal("multixact %u has more than one updating member",
						 multi);
			}
			result_xid = *xactptr;
			result_status = status;
		}
		else if (!TransactionIdIsValid(result_xid))
		{
			/* no candidate yet: take this locking-only member for now */
			result_xid = *xactptr;
			result_status = status;
		}
	}

	member->xid = result_xid;
	member->status = result_status;
	return true;
}

/*
 * Frees the malloced reader, including its two SLRU readers.
 */
void
FreeOldMultiXactReader(OldMultiXactReader *state)
{
	FreeSlruRead(state->offset);
	FreeSlruRead(state->members);

	pfree(state);
}
+ */ +typedef uint32 OldMultiXactOffset; + +typedef struct OldMultiXactReader +{ + MultiXactId nextMXact; + OldMultiXactOffset nextOffset; + + SlruSegState *offset; + SlruSegState *members; +} OldMultiXactReader; + +extern OldMultiXactReader *AllocOldMultiXactRead(char *pgdata, + MultiXactId nextMulti, + OldMultiXactOffset nextOffset); +extern bool GetOldMultiXactIdSingleMember(OldMultiXactReader *state, + MultiXactId multi, + MultiXactMember *member); +extern void FreeOldMultiXactReader(OldMultiXactReader *reader); + +#endif /* MULTIXACT_READ_V18_H */ diff --git a/src/bin/pg_upgrade/multixact_rewrite.c b/src/bin/pg_upgrade/multixact_rewrite.c new file mode 100644 index 000000000000..d483b2ff31f5 --- /dev/null +++ b/src/bin/pg_upgrade/multixact_rewrite.c @@ -0,0 +1,195 @@ +/* + * multixact_rewrite.c + * + * Functions to convert multixact SLRUs from the pre-v19 format to the current + * format with 64-bit MultiXactOffsets. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_rewrite.c + */ + +#include "postgres_fe.h" + +#include "access/multixact_internal.h" +#include "multixact_read_v18.h" +#include "pg_upgrade.h" + +static void RecordMultiXactOffset(SlruSegState *offsets_writer, MultiXactId multi, + MultiXactOffset offset); +static void RecordMultiXactMembers(SlruSegState *members_writer, + MultiXactOffset offset, + int nmembers, MultiXactMember *members); + +/* + * Convert pg_multixact/offset and /members from the old pre-v19 format with + * 32-bit offsets to the current format. + * + * Multixids in the range [from_multi, to_multi) are read from the old + * cluster, and written in the new format. An important edge case is that if + * from_multi == to_multi, this initializes the new pg_multixact files in the + * new format without trying to open any old files. (We rely on that when + * upgrading from PostgreSQL version 9.2 or below.) 
+ * + * Returns the new nextOffset value; the caller should set it in the new + * control file. The new members always start from offset 1, regardless of + * the offset range used in the old cluster. + */ +MultiXactOffset +rewrite_multixacts(MultiXactId from_multi, MultiXactId to_multi) +{ + MultiXactId oldest_multi, + next_multi; + MultiXactOffset next_offset; + SlruSegState *offsets_writer; + SlruSegState *members_writer; + char dir[MAXPGPATH] = {0}; + bool prev_multixid_valid = false; + + /* + * The range of valid multi XIDs is unchanged by the conversion (they are + * referenced from the heap tables), but the members SLRU is rewritten to + * start from offset 1. + */ + oldest_multi = from_multi; + next_multi = to_multi; + next_offset = 1; + + /* Prepare to write the new SLRU files */ + pg_sprintf(dir, "%s/pg_multixact/offsets", new_cluster.pgdata); + offsets_writer = AllocSlruWrite(dir, false); + SlruWriteSwitchPage(offsets_writer, MultiXactIdToOffsetPage(from_multi)); + + pg_sprintf(dir, "%s/pg_multixact/members", new_cluster.pgdata); + members_writer = AllocSlruWrite(dir, true /* use long segment names */ ); + SlruWriteSwitchPage(members_writer, MXOffsetToMemberPage(next_offset)); + + /* + * Convert old multixids, if needed, by reading them one-by-one from the + * old cluster. + */ + if (to_multi != from_multi) + { + OldMultiXactReader *old_reader; + + old_reader = AllocOldMultiXactRead(old_cluster.pgdata, + old_cluster.controldata.chkpnt_nxtmulti, + old_cluster.controldata.chkpnt_nxtmxoff); + + for (MultiXactId multi = oldest_multi; multi != next_multi;) + { + MultiXactMember member; + bool multixid_valid; + + /* + * Read this multixid's members. + * + * Locking-only XIDs that may be part of multi-xids don't matter + * after upgrade, as there can be no transactions running across + * upgrade. So as a small optimization, we only read one member + * from each multixid: the one updating one, or if there was no + * update, arbitrarily the first locking xid. 
+ */ + multixid_valid = GetOldMultiXactIdSingleMember(old_reader, multi, &member); + + /* + * Write the new offset to pg_multixact/offsets. + * + * If the old multixid was invalid, we still need to write this + * offset if the *previous* multixid was valid. That's because + * the when reading a multixids, the number of members is + * calculated from the difference between the current and the next + * multixid's offsets. + */ + RecordMultiXactOffset(offsets_writer, multi, + (multixid_valid || prev_multixid_valid) ? next_offset : 0); + + if (multixid_valid) + { + RecordMultiXactMembers(members_writer, next_offset, 1, &member); + next_offset += 1; + } + + /* Advance to next multixid, handling wraparound */ + multi++; + if (multi < FirstMultiXactId) + multi = FirstMultiXactId; + prev_multixid_valid = multixid_valid; + } + + FreeOldMultiXactReader(old_reader); + } + + /* write the final 'next' offset to the last SLRU page */ + RecordMultiXactOffset(offsets_writer, next_multi, + prev_multixid_valid ? 
next_offset : 0); + + /* Release resources */ + FreeSlruWrite(offsets_writer); + FreeSlruWrite(members_writer); + + return next_offset; +} + + +/* + * Write one offset to the offset SLRU + */ +static void +RecordMultiXactOffset(SlruSegState *offsets_writer, MultiXactId multi, + MultiXactOffset offset) +{ + int64 pageno; + int entryno; + char *buf; + MultiXactOffset *offptr; + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + buf = SlruWriteSwitchPage(offsets_writer, pageno); + offptr = (MultiXactOffset *) buf; + offptr[entryno] = offset; +} + +/* + * Write the members for one multixid in the members SLRU + * + * (Currently, this is only ever called with nmembers == 1) + */ +static void +RecordMultiXactMembers(SlruSegState *members_writer, + MultiXactOffset offset, + int nmembers, MultiXactMember *members) +{ + for (int i = 0; i < nmembers; i++, offset++) + { + int64 pageno; + char *buf; + TransactionId *memberptr; + uint32 *flagsptr; + uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + Assert(members[i].status <= MultiXactStatusUpdate); + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + + buf = SlruWriteSwitchPage(members_writer, pageno); + + memberptr = (TransactionId *) (buf + memberoff); + + *memberptr = members[i].xid; + + flagsptr = (uint32 *) (buf + flagsoff); + + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; + } +} diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 490e98fa26f2..b3405c22135f 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -43,6 +43,7 @@ #include +#include "access/multixact.h" #include "catalog/pg_class_d.h" #include "common/file_perm.h" #include "common/logging.h" @@ -807,15 
+808,15 @@ copy_xact_xlog_xid(void) new_cluster.pgdata); check_ok(); - /* - * If the old server is before the MULTIXACT_FORMATCHANGE_CAT_VER change - * (see pg_upgrade.h) and the new server is after, then we don't copy - * pg_multixact files, but we need to reset pg_control so that the new - * server doesn't attempt to read multis older than the cutoff value. - */ - if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && - new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + /* Copy or convert pg_multixact files */ + Assert(new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER); + Assert(new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER); + if (old_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER) { + /* No change in multixact format, just copy the files */ + MultiXactId new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti; + MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff; + copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); copy_subdir_files("pg_multixact/members", "pg_multixact/members"); @@ -826,38 +827,64 @@ copy_xact_xlog_xid(void) * counters here and the oldest multi present on system. */ exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"", - new_cluster.bindir, - old_cluster.controldata.chkpnt_nxtmxoff, - old_cluster.controldata.chkpnt_nxtmulti, + "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"", + new_cluster.bindir, new_nxtmxoff, new_nxtmulti, old_cluster.controldata.chkpnt_oldstMulti, new_cluster.pgdata); check_ok(); } - else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + else { + /* Conversion is needed */ + MultiXactId nxtmulti; + MultiXactId oldstMulti; + MultiXactOffset nxtmxoff; + /* - * Remove offsets/0000 file created by initdb that no longer matches - * the new multi-xid value. "members" starts at zero so no need to - * remove it. 
+ * Determine the range of multixacts to convert. */ - remove_new_subdir("pg_multixact/offsets", false); + nxtmulti = old_cluster.controldata.chkpnt_nxtmulti; + if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + oldstMulti = old_cluster.controldata.chkpnt_oldstMulti; + else + { + /* + * In PostgreSQL 9.2 and below, multitransactions were only used + * for row locking, and as such don't need to be preserved during + * upgrade. In that case, we utilize rewrite_multixacts() just to + * initialize new, empty files in the new format. + * + * It's important that the oldest multi is set to the latest value + * used by the old system, so that multixact.c returns the empty + * set for multis that might be present on disk. + */ + oldstMulti = nxtmulti; + } + /* handle wraparound */ + if (nxtmulti < FirstMultiXactId) + nxtmulti = FirstMultiXactId; + if (oldstMulti < FirstMultiXactId) + oldstMulti = FirstMultiXactId; - prep_status("Setting oldest multixact ID in new cluster"); + /* + * Remove the files created by initdb in the new cluster. + * rewrite_multixacts() will create new ones. + */ + remove_new_subdir("pg_multixact/members", false); + remove_new_subdir("pg_multixact/offsets", false); /* - * We don't preserve files in this case, but it's important that the - * oldest multi is set to the latest value used by the old system, so - * that multixact.c returns the empty set for multis that might be - * present on disk. We set next multi to the value following that; it - * might end up wrapped around (i.e. 0) if the old cluster had - * next=MaxMultiXactId, but multixact.c can cope with that just fine. + * Create new pg_multixact files, converting old ones if needed. 
*/ + prep_status("Converting pg_multixact files"); + nxtmxoff = rewrite_multixacts(oldstMulti, nxtmulti); + check_ok(); + + prep_status("Setting next multixact ID and offset for new cluster"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/pg_resetwal\" -m %u,%u \"%s\"", + "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"", new_cluster.bindir, - old_cluster.controldata.chkpnt_nxtmulti + 1, - old_cluster.controldata.chkpnt_nxtmulti, + nxtmxoff, nxtmulti, oldstMulti, new_cluster.pgdata); check_ok(); } diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index e86336f4be95..48f15dff5e06 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -114,6 +114,13 @@ extern char *output_files[]; */ #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231 +/* + * MultiXactOffset was changed from 32-bit to 64-bit in version 19, at this + * catalog version. pg_multixact files need to be converted when upgrading + * across this version. + */ +#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 999999999 + /* * large object chunk size added to pg_controldata, * commit 5f93c37805e7485488480916b4585e098d3cc883 @@ -235,7 +242,7 @@ typedef struct uint32 chkpnt_nxtepoch; uint32 chkpnt_nxtoid; uint32 chkpnt_nxtmulti; - uint32 chkpnt_nxtmxoff; + uint64 chkpnt_nxtmxoff; uint32 chkpnt_oldstMulti; uint32 chkpnt_oldstxid; uint32 align; @@ -499,6 +506,9 @@ void old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, void report_extension_updates(ClusterInfo *cluster); +/* multixact_rewrite.c */ +MultiXactOffset rewrite_multixacts(MultiXactId from_multi, MultiXactId to_multi); + /* parallel.c */ void parallel_exec_prog(const char *log_file, const char *opt_log_file, const char *fmt,...) 
pg_attribute_printf(3, 4); diff --git a/src/bin/pg_upgrade/slru_io.c b/src/bin/pg_upgrade/slru_io.c new file mode 100644 index 000000000000..720445289b9f --- /dev/null +++ b/src/bin/pg_upgrade/slru_io.c @@ -0,0 +1,258 @@ +/* + * slru_io.c + * + * Routines for reading and writing SLRU files during upgrade. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/slru_io.c + */ + +#include "postgres_fe.h" + +#include + +#include "common/fe_memutils.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "port/pg_iovec.h" +#include "pg_upgrade.h" +#include "slru_io.h" + +static SlruSegState *AllocSlruSegState(const char *dir); +static char *SlruFileName(SlruSegState *state, int64 segno); +static void SlruFlush(SlruSegState *state); + +static SlruSegState * +AllocSlruSegState(const char *dir) +{ + SlruSegState *state = pg_malloc(sizeof(*state)); + + state->dir = pstrdup(dir); + state->fn = NULL; + state->fd = -1; + state->segno = -1; + state->pageno = 0; + + return state; +} + +/* similar to the backend function with the same name */ +static char * +SlruFileName(SlruSegState *state, int64 segno) +{ + if (state->long_segment_names) + { + Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF)); + return psprintf("%s/%015" PRIX64, state->dir, segno); + } + else + { + Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF)); + return psprintf("%s/%04X", state->dir, (unsigned int) segno); + } +} + +/* + * Create slru reader for dir. + * + * Returns the malloced memory used by the all other read calls in this module. + */ +SlruSegState * +AllocSlruRead(const char *dir, bool long_segment_names) +{ + SlruSegState *state = AllocSlruSegState(dir); + + state->writing = false; + state->long_segment_names = long_segment_names; + + return state; +} + +/* + * Open given page for reading. + * + * Reading can be done in random order. 
+ */ +char * +SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno) +{ + int64 segno; + ssize_t bytes_read; + off_t offset; + + Assert(!state->writing); /* read only mode */ + + if (state->segno != -1 && pageno == state->pageno) + return state->buf.data; + + segno = pageno / SLRU_PAGES_PER_SEGMENT; + if (segno != state->segno) + { + if (state->segno != -1) + { + close(state->fd); + state->fd = -1; + + pg_free(state->fn); + state->fn = NULL; + + state->segno = -1; + } + + /* Open new segment */ + state->fn = SlruFileName(state, segno); + if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", state->fn); + } + state->segno = segno; + + offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ; + bytes_read = 0; + while (bytes_read < BLCKSZ) + { + ssize_t rc; + + rc = pg_pread(state->fd, + &state->buf.data + bytes_read, + BLCKSZ - bytes_read, + offset + bytes_read); + if (rc < 0) + { + if (errno == EINTR) + continue; + pg_fatal("could not read file \"%s\": %m", state->fn); + } + if (rc == 0) + { + /* unexpected EOF */ + pg_log(PG_WARNING, "unexpected EOF reading file \"%s\" at offset %zd, reading as zeros", state->fn, + offset + bytes_read); + memset(&state->buf.data + bytes_read, 0, BLCKSZ - bytes_read); + break; + } + bytes_read += rc; + } + state->pageno = pageno; + + return state->buf.data; +} + +/* + * Frees the malloced reader. + */ +void +FreeSlruRead(SlruSegState *state) +{ + Assert(!state->writing); /* read only mode */ + + if (state->fd != -1) + close(state->fd); + pg_free(state); +} + +/* + * Create slru writer for dir. + * + * Returns the malloced memory used by the all other write calls in this module. + */ +SlruSegState * +AllocSlruWrite(const char *dir, bool long_segment_names) +{ + SlruSegState *state = AllocSlruSegState(dir); + + state->writing = true; + state->long_segment_names = long_segment_names; + + return state; +} + +/* + * Open the given page for writing. 
+ * + * NOTE: This uses O_EXCL when stepping to a new segment, so this assumes that + * each segment is written in full before moving on to next one. This + * limitation would be easy to lift if needed, but it fits the usage pattern of + * current callers. + */ +char * +SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno) +{ + int64 segno; + off_t offset; + + if (state->segno != -1 && pageno == state->pageno) + return state->buf.data; + + segno = pageno / SLRU_PAGES_PER_SEGMENT; + offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ; + + SlruFlush(state); + memset(state->buf.data, 0, BLCKSZ); + + if (segno != state->segno) + { + if (state->segno != -1) + { + close(state->fd); + state->fd = -1; + + pg_free(state->fn); + state->fn = NULL; + + state->segno = -1; + } + + /* Create the segment */ + state->fn = SlruFileName(state, segno); + if ((state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + { + pg_fatal("could not create file \"%s\": %m", state->fn); + } + + state->segno = segno; + + if (offset > 0) + { + if (pg_pwrite_zeros(state->fd, offset, 0) < 0) + pg_fatal("could not write file \"%s\": %m", state->fn); + } + } + + state->pageno = pageno; + + return state->buf.data; +} + +static void +SlruFlush(SlruSegState *state) +{ + struct iovec iovec = { + .iov_base = &state->buf, + .iov_len = BLCKSZ, + }; + off_t offset; + + if (state->segno == -1) + return; + + offset = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ; + + if (pg_pwritev_with_retry(state->fd, &iovec, 1, offset) < 0) + pg_fatal("could not write file \"%s\": %m", state->fn); +} + +/* + * Frees the malloced writer. 
+ */ +void +FreeSlruWrite(SlruSegState *state) +{ + Assert(state->writing); + + SlruFlush(state); + + if (state->fd != -1) + close(state->fd); + pg_free(state); +} diff --git a/src/bin/pg_upgrade/slru_io.h b/src/bin/pg_upgrade/slru_io.h new file mode 100644 index 000000000000..5c80a679b4d1 --- /dev/null +++ b/src/bin/pg_upgrade/slru_io.h @@ -0,0 +1,52 @@ +/* + * slru_io.h + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/slru_io.h + */ + +#ifndef SLRU_IO_H +#define SLRU_IO_H + +/* + * State for reading or writing an SLRU, with a one page buffer. + */ +typedef struct SlruSegState +{ + bool writing; + bool long_segment_names; + + char *dir; + char *fn; + int fd; + int64 segno; + uint64 pageno; + + PGAlignedBlock buf; +} SlruSegState; + +extern SlruSegState *AllocSlruRead(const char *dir, bool long_segment_names); +extern char *SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno); +extern void FreeSlruRead(SlruSegState *state); + +static inline char * +SlruReadSwitchPage(SlruSegState *state, uint64 pageno) +{ + if (state->segno != -1 && pageno == state->pageno) + return state->buf.data; + return SlruReadSwitchPageSlow(state, pageno); +} + +extern SlruSegState *AllocSlruWrite(const char *dir, bool long_segment_names); +extern char *SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno); +extern void FreeSlruWrite(SlruSegState *state); + +static inline char * +SlruWriteSwitchPage(SlruSegState *state, uint64 pageno) +{ + if (state->segno != -1 && pageno == state->pageno) + return state->buf.data; + return SlruWriteSwitchPageSlow(state, pageno); +} + +#endif /* SLRU_IO_H */ diff --git a/src/bin/pg_upgrade/t/007_multixact_conversion.pl b/src/bin/pg_upgrade/t/007_multixact_conversion.pl new file mode 100644 index 000000000000..f84bd5668bfa --- /dev/null +++ b/src/bin/pg_upgrade/t/007_multixact_conversion.pl @@ -0,0 +1,339 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Version 19 expanded MultiXactOffset 
from 32 to 64 bits. Upgrading +# across that requires rewriting the SLRU files to the new format. +# This file contains tests for the conversion. +# +# To run, set the 'oldinstall' ENV variable to point to a pre-v19 +# installation. If it's not set, or if it points to a v19 or above +# installation, this still performs a very basic test, upgrading a +# cluster with some multixacts. It's not very interesting, however, +# because there's no conversion involved in that case. + +use strict; +use warnings FATAL => 'all'; + +use Math::BigInt; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Temp dir for dumps. +my $tempdir = PostgreSQL::Test::Utils::tempdir; + +# A workload that consumes multixids. The purpose of this is to +# generate some multixids in the old cluster, so that we can test +# upgrading them. The workload is a mix of KEY SHARE locking queries +# and UPDATEs, and commits and aborts, to generate a mix of multixids +# with different statuses. It consumes around 3000 multixids with +# 30000 members. That's enough to span more than one multixid +# 'offsets' page, and more than one 'members' segment. +# +# The workload leaves behind a table called 'mxofftest' containing a +# small number of rows referencing some of the generated multixids. +# +# Because this function is used to generate test data on the old +# installation, it needs to work with older PostgreSQL server +# versions. +# +# The first argument is the cluster to connect to, the second argument +# is a cluster using the new version. We need the 'psql' binary from +# the new version; the new cluster is otherwise unused. (We need to +# use the new 'psql' because some of the more advanced background psql +# perl module features depend on a fairly recent psql version.) 
+sub mxact_workload +{ + my $node = shift; # Cluster to connect to + my $binnode = shift; # Use the psql binary from this cluster + + my $connstr = $node->connstr('postgres'); + + $node->start; + $node->safe_psql('postgres', qq[ + CREATE TABLE mxofftest (id INT PRIMARY KEY, n_updated INT) + WITH (AUTOVACUUM_ENABLED=FALSE); + INSERT INTO mxofftest SELECT G, 0 FROM GENERATE_SERIES(1, 50) G; + ]); + + my $nclients = 20; + my $update_every = 13; + my $abort_every = 11; + my @connections = (); + + # Silence the logging of the statements we run to avoid + # unnecessarily bloating the test logs. This runs before the + # upgrade we're testing, so the details should not be very + # interesting for debugging. But if needed, you can make it more + # verbose by setting this. + my $verbose = 0; + + # Open $nclients connections to the database. Start a transaction + # in each connection. + for (1 .. $nclients) + { + # Use the psql binary from the new installation. The + # BackgroundPsql functionality doesn't work with older psql + # versions. + my $conn = $binnode->background_psql('', + connstr => $connstr); + + $conn->query_safe("SET log_statement=none", verbose => $verbose) unless $verbose; + $conn->query_safe("SET enable_seqscan=off", verbose => $verbose); + $conn->query_safe("BEGIN", verbose => $verbose); + + push(@connections, $conn); + } + + # Run queries, cycling through the connections in a + # round-robin fashion. We keep a transaction open in each + # connection at all times, and lock/update the rows. With + # $nclients connections, each SELECT FOR KEY SHARE query generates a new + # multixid, containing the XIDs of the transactions running + # at the time. 
+ for (my $i = 0; $i < 3000; $i++) + { + my $conn = $connections[ $i % $nclients ]; + + my $sql; + if ($i % $abort_every == 0) + { + $sql = "ABORT; "; + } + else + { + $sql = "COMMIT; "; + } + $sql .= "BEGIN; "; + + if ($i % $update_every == 0) + { + $sql .= qq[ + UPDATE mxofftest SET n_updated = n_updated + 1 WHERE id = ${i} % 50; + ]; + } + else + { + my $threshold = int($i / 3000 * 50); + $sql .= qq[ + select count(*) from ( + SELECT * FROM mxofftest WHERE id >= $threshold FOR KEY SHARE + ) as x + ]; + } + $conn->query_safe($sql, verbose => $verbose); + } + + for my $conn (@connections) + { + $conn->quit(); + } + + $node->stop; + return; +} + +# Dump the contents of the 'mxofftest' table, created by mxact_workload, to a file under $tempdir and return the file's path +sub get_test_table_contents +{ + my ($node, $file_prefix) = @_; + + my $contents = $node->safe_psql('postgres', + "SELECT ctid, xmin, xmax, * FROM mxofftest"); + + my $dumpfile = $tempdir . '/' . $file_prefix . '.sql'; + open(my $dh, '>', $dumpfile) + || die "could not open $dumpfile for writing $!"; + print $dh $contents; + close($dh); + + return $dumpfile; +} + +# Read NextMultiOffset from the control file +# +# Note: This is used on both the old and the new installation, so the +# command arguments and the output parsing used here must work with +# all PostgreSQL versions supported by the test. +sub read_next_mxoff +{ + my $node = shift; + + my $pg_controldata_path = $node->installed_command('pg_controldata'); + my ($stdout, $stderr) = + run_command([ $pg_controldata_path, $node->data_dir ]); + $stdout =~ /^Latest checkpoint's NextMultiOffset:\s*(.*)$/m + or die "could not read NextMultiOffset from pg_controldata"; + return $1; +} + +# Reset a cluster's next multixact offset to the given offset. +# +# Note: This is only used on a pre-v19 old installation, so the +# command arguments and the output parsing used here only need to +# work with the pre-v19 versions supported by the test. 
+sub reset_mxoff_pre_v19 +{ + my $node = shift; + my $offset = shift; + + my $pg_resetwal_path = $node->installed_command('pg_resetwal'); + # Get block size + my ($out, $err) = + run_command([ $pg_resetwal_path, '--dry-run', $node->data_dir ]); + $out =~ /^Database block size: *(\d+)$/m or die "could not read block size from pg_resetwal output"; + my $blcksz = $1; + # SLRU_PAGES_PER_SEGMENT is always 32 on pre-v19 versions + my $slru_pages_per_segment = 32; + + # Verify that no multixids are currently in use. Resetting would + # destroy them. (A freshly initialized cluster has no multixids.) + $out =~ /^Latest checkpoint's NextMultiXactId: *(\d+)$/m or die "could not read NextMultiXactId from pg_resetwal output"; + my $next_mxid = $1; + $out =~ /^Latest checkpoint's oldestMultiXid: *(\d+)$/m or die "could not read oldestMultiXid from pg_resetwal output"; + my $oldest_mxid = $1; + die "cluster has some multixids in use" unless $next_mxid == $oldest_mxid; + + # Reset to new offset using pg_resetwal + my @cmd = ( + $pg_resetwal_path, + '--pgdata' => $node->data_dir, + '--multixact-offset' => $offset); + command_ok(\@cmd, 'set oldest multixact-offset'); + + # pg_resetwal just updates the control file. The cluster will + # refuse to start up, if the SLRU segment corresponding to the + # offset does not exist. Create a dummy segment that covers the + # given offset, filled with zeros. But first remove any old + # segments. + unlink glob $node->data_dir . "/pg_multixact/members/*"; + + my $mult = $slru_pages_per_segment * int($blcksz / 20) * 4; # members per segment: pages x 20-byte groups per page x 4 members per group + my $segname = sprintf "%04X", int($offset / $mult); + + my $path = $node->data_dir . "/pg_multixact/members/" . $segname; + + my $null_block = "\x00" x $blcksz; + open(my $dh, '>', $path) + || die "could not open $path for writing $!"; + for (1 .. $slru_pages_per_segment) # exactly one full segment's worth of pages + { + print $dh $null_block; + } + close($dh) || die "could not close $path: $!"; +} + +# Main test workhorse routine. +# Dump data on old version, run pg_upgrade, compare data after upgrade. 
+sub upgrade_and_compare +{ + my $tag = shift; + my $oldnode = shift; + my $newnode = shift; + + command_ok( + [ + 'pg_upgrade', '--no-sync', + '--old-datadir' => $oldnode->data_dir, + '--new-datadir' => $newnode->data_dir, + '--old-bindir' => $oldnode->config_data('--bindir'), + '--new-bindir' => $newnode->config_data('--bindir'), + '--socketdir' => $newnode->host, + '--old-port' => $oldnode->port, + '--new-port' => $newnode->port, + ], + 'run of pg_upgrade for new instance'); + + # Note: we do this *after* running pg_upgrade, to ensure that we + # don't set all the hint bits before upgrade by doing the SELECT + # on the table. + $oldnode->start; + my $old_dump = get_test_table_contents($oldnode, "oldnode_${tag}_dump"); + $oldnode->stop; + + $newnode->start; + my $new_dump = get_test_table_contents($newnode, "newnode_${tag}_dump"); + $newnode->stop; + + compare_files($old_dump, $new_dump, + 'test table contents from original and upgraded databases match'); +} + +my $old_version; + +# Basic scenario: Create a cluster using the old installation, run +# multixid-creating workload on it, then upgrade. +# +# This works even if the old and new versions are the same, +# although it's not very interesting as the conversion routines only +# run when upgrading from a pre-v19 cluster. +{ + my $tag = 'basic'; + my $old = + PostgreSQL::Test::Cluster->new("${tag}_oldnode", + install_path => $ENV{oldinstall}); + my $new = PostgreSQL::Test::Cluster->new("${tag}_newnode"); + + $old->init(extra => ['-k']); + + $old_version = $old->pg_version; + note "old installation is version $old_version\n"; + + # Run the workload + my $start_mxoff = read_next_mxoff($old); + mxact_workload($old, $new); + my $finish_mxoff = read_next_mxoff($old); + + $new->init; + upgrade_and_compare($tag, $old, $new); + + my $new_next_mxoff = read_next_mxoff($new); + + note ">>> case #${tag}\n" + . " oldnode mxoff from ${start_mxoff} to ${finish_mxoff}\n" + . 
" newnode mxoff ${new_next_mxoff}\n"; +} + +# Wraparound scenario: This is the same as the basic scenario, but the +# old cluster goes through mxoffset wraparound. +# +# This requires the old installation to be older than version 19, +# because the hacks we use to reset the old cluster to a state just +# before the wraparound rely on the pre-v19 file format. In version +# 19, offsets no longer wrap around anyway. +SKIP: +{ + skip + "skipping mxoffset conversion tests because upgrading from the old version does not require conversion" + if ($old_version >= '19devel'); + + my $tag = 'wraparound'; + my $old = + PostgreSQL::Test::Cluster->new("${tag}_oldnode", + install_path => $ENV{oldinstall}); + my $new = PostgreSQL::Test::Cluster->new("${tag}_newnode"); + + $old->init(extra => ['-k']); + + # Reset the NextMultiOffset value in the old cluster to just before 32-bit wraparound. + reset_mxoff_pre_v19($old, 0xFFFFEC77); + + # Run the workload. This crosses the wraparound. + my $start_mxoff = read_next_mxoff($old); + mxact_workload($old, $new); + my $finish_mxoff = read_next_mxoff($old); + + # Verify that wraparound happened. + cmp_ok($finish_mxoff, '<', $start_mxoff, + "mxoff wrapped around in old cluster"); + + $new->init; + upgrade_and_compare($tag, $old, $new); + + my $new_next_mxoff = read_next_mxoff($new); + + note ">>> case #${tag}\n" + . " oldnode mxoff from ${start_mxoff} to ${finish_mxoff}\n" + . " newnode mxoff ${new_next_mxoff}\n"; +} + +done_testing(); diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 82e4bb90dd58..6433fe163641 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -28,8 +28,6 @@ #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) -#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF) - /* * Possible multixact lock modes ("status"). 
The first four modes are for * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the @@ -111,7 +109,7 @@ extern bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly); extern void MultiXactIdSetOldestMember(void); extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly); -extern bool GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, +extern void GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset); extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); @@ -131,8 +129,7 @@ extern void BootStrapMultiXact(void); extern void StartupMultiXact(void); extern void TrimMultiXact(void); extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, - Oid oldest_datoid, - bool is_startup); + Oid oldest_datoid); extern void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, MultiXactOffset *nextMultiOffset, diff --git a/src/include/access/multixact_internal.h b/src/include/access/multixact_internal.h new file mode 100644 index 000000000000..c4dd1aa044f7 --- /dev/null +++ b/src/include/access/multixact_internal.h @@ -0,0 +1,119 @@ +/* + * multixact_internal.h + * + * PostgreSQL multi-transaction-log manager internal declarations + * + * These functions and definitions are for dealing with pg_multixact pages. + * They are internal to multixact.c, but they are exported here to allow + * pg_upgrade to write pg_multixact files directly. + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/multixact_internal.h + */ +#ifndef MULTIXACT_INTERNAL_H +#define MULTIXACT_INTERNAL_H + +#include "access/multixact.h" + + +/* + * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is + * used everywhere else in Postgres. 
+ */ + +/* We need 8 bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + +static inline int64 +MultiXactIdToOffsetPage(MultiXactId multi) +{ + return multi / MULTIXACT_OFFSETS_PER_PAGE; /* offsets page holding this multixid's entry */ +} + +static inline int +MultiXactIdToOffsetEntry(MultiXactId multi) +{ + return multi % MULTIXACT_OFFSETS_PER_PAGE; /* index of the entry within that page */ +} + +static inline int64 +MultiXactIdToOffsetSegment(MultiXactId multi) +{ + return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT; /* segment containing that page */ +} + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. We call each such 5-word (20-byte) set a "group", and + * groups are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" functions below work with byte offsets, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? 
*/ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +static inline int64 +MXOffsetToMemberPage(MultiXactOffset offset) +{ + return offset / MULTIXACT_MEMBERS_PER_PAGE; +} + +static inline int64 +MXOffsetToMemberSegment(MultiXactOffset offset) +{ + return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; /* segment containing the member's page */ +} + +/* Location (byte offset within page) of flag word for a given member */ +static inline int +MXOffsetToFlagsOffset(MultiXactOffset offset) +{ + MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; + int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; + + return byteoff; +} + +static inline int +MXOffsetToFlagsBitShift(MultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; + + return bshift; /* bit position of this member's flags within its group's flag word */ +} + +/* Location (byte offset within page) of TransactionId of given member */ +static inline int +MXOffsetToMemberOffset(MultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + + return MXOffsetToFlagsOffset(offset) + + MULTIXACT_FLAGBYTES_PER_GROUP + + member_in_group * sizeof(TransactionId); +} + +#endif /* MULTIXACT_INTERNAL_H */ diff --git a/src/include/c.h b/src/include/c.h index ccd2b654d459..62cbf7a2eec2 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -669,7 +669,7 @@ typedef uint32 SubTransactionId; /* MultiXactId must be equivalent to 
TransactionId, to fit in t_xmax */ typedef TransactionId MultiXactId; -typedef uint32 MultiXactOffset; +typedef uint64 MultiXactOffset; typedef uint32 CommandId; diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index d13ed62af46f..b0162c2bf63b 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,7 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202512051 +// FIXME: bump it +#define CATALOG_VERSION_NO 999999999 #endif diff --git a/src/test/modules/test_slru/t/002_multixact_wraparound.pl b/src/test/modules/test_slru/t/002_multixact_wraparound.pl index 169333fc5647..272d8e6fb087 100644 --- a/src/test/modules/test_slru/t/002_multixact_wraparound.pl +++ b/src/test/modules/test_slru/t/002_multixact_wraparound.pl @@ -37,7 +37,7 @@ # initialize the 'offsets' SLRU file containing the new next multixid # with zeros -my $multixact_offsets_per_page = $blcksz / 4; # sizeof(MultiXactOffset) == 4 +my $multixact_offsets_per_page = $blcksz / 8; # sizeof(MultiXactOffset) == 8 my $segno = int(0xFFFFFFF8 / $multixact_offsets_per_page / $slru_pages_per_segment); my $slru_file = sprintf('%s/pg_multixact/offsets/%04X', $node_pgdata, $segno); diff --git a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm index 60bbd5dd445b..9825aaa9bb42 100644 --- a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm +++ b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm @@ -230,18 +230,23 @@ Executes a query in the current session and returns the output in scalar context and (output, error) in list context where error is 1 in case there was output generated on stderr when executing the query. +By default, the query and its results are printed to the test output. This +can be disabled by passing the keyword parameter verbose => false. 
+ =cut sub query { - my ($self, $query) = @_; + my ($self, $query, %params) = @_; my $ret; my $output; my $query_cnt = $self->{query_cnt}++; + $params{verbose} = 1 unless defined $params{verbose}; + local $Test::Builder::Level = $Test::Builder::Level + 1; - note "issuing query $query_cnt via background psql: $query"; + note "issuing query $query_cnt via background psql: $query" unless !$params{verbose}; $self->{timeout}->start() if (defined($self->{query_timer_restart})); @@ -280,7 +285,7 @@ sub query explain { stdout => $self->{stdout}, stderr => $self->{stderr}, - }; + } unless !$params{verbose}; # Remove banner from stdout and stderr, our caller doesn't care. The # first newline is optional, as there would not be one if consuming an @@ -308,9 +313,9 @@ Query failure is determined by it producing output on stderr. sub query_safe { - my ($self, $query) = @_; + my ($self, $query, %params) = @_; - my $ret = $self->query($query); + my $ret = $self->query($query, %params); if ($self->{stderr} ne "") { diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 747528c4af1f..295988b8b877 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -1793,13 +1793,20 @@ sub _get_env return (%inst_env); } -# Private routine to get an installation path qualified command. -# -# IPC::Run maintains a cache, %cmd_cache, mapping commands to paths. Tests -# which use nodes spanning more than one postgres installation path need to -# avoid confusing which installation's binaries get run. Setting $ENV{PATH} is -# insufficient, as IPC::Run does not check to see if the path has changed since -# caching a command. +=pod + +=item $node->installed_command(cmd) + +Get an installation path qualified command. + +IPC::Run maintains a cache, %cmd_cache, mapping commands to paths. 
Tests +which use nodes spanning more than one postgres installation path need to +avoid confusing which installation's binaries get run. Setting $ENV{PATH} is +insufficient, as IPC::Run does not check to see if the path has changed since +caching a command. + +=cut + sub installed_command { my ($self, $cmd) = @_; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index c1ad80a418d0..f69e68e6dbd2 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1731,6 +1731,7 @@ MultiXactMember MultiXactOffset MultiXactStateData MultiXactStatus +MultiXactWriter MultirangeIOData MultirangeParseState MultirangeType @@ -1816,6 +1817,7 @@ OffsetVarNodes_context Oid OidOptions OkeysState +OldMultiXactReader OldToNewMapping OldToNewMappingData OnCommitAction @@ -2814,6 +2816,7 @@ SlruCtlData SlruErrorCause SlruPageStatus SlruScanCallback +SlruSegState SlruShared SlruSharedData SlruWriteAll