Diffstat (limited to 'src')
33 files changed, 754 insertions, 324 deletions
diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock index 843c2e58f92..16f7d78b7d2 100644 --- a/src/backend/access/heap/README.tuplock +++ b/src/backend/access/heap/README.tuplock @@ -199,3 +199,35 @@ under a reader holding a pin. A reader of a heap_fetch() result tuple may witness a torn read. Current inplace-updated fields are aligned and are no wider than four bytes, and current readers don't need consistency across fields. Hence, they get by with just fetching each field once. + +During logical decoding, caches reflect an inplace update no later than the +next XLOG_XACT_INVALIDATIONS. That record witnesses the end of a command. +Tuples of its cmin are then visible to decoding, as are inplace updates of any +lower LSN. Inplace updates of a higher LSN may also be visible, even if those +updates would have been invisible to a non-historic snapshot matching +decoding's historic snapshot. (In other words, decoding may see inplace +updates that were not visible to a similar snapshot taken during original +transaction processing.) That's a consequence of inplace update violating +MVCC: there are no snapshot-specific versions of inplace-updated values. This +all makes it hard to reason about inplace-updated column reads during logical +decoding, but the behavior does suffice for relhasindex. A relhasindex=t in +CREATE INDEX becomes visible no later than the new pg_index row. While it may +be visible earlier, that's harmless. Finding zero indexes despite +relhasindex=t is normal in more cases than this, e.g. after DROP INDEX. +Example of a case that meaningfully reacts to the inplace inval: + +CREATE TABLE cat (c int) WITH (user_catalog_table = true); +CREATE TABLE normal (d int); +... +CREATE INDEX ON cat (c)\; INSERT INTO normal VALUES (1); + +If the output plugin reads "cat" during decoding of the INSERT, it's fair to +want that read to see relhasindex=t and use the new index. + +An alternative would be to have decoding of XLOG_HEAP_INPLACE immediately +execute its invals. That would behave more like invals during original +transaction processing. It would remove the decoding-specific delay in e.g. a +decoding plugin witnessing a relfrozenxid change. However, a good use case +for that is unlikely, since the plugin would still witness relfrozenxid +changes prematurely. Hence, inplace update takes the trivial approach of +delegating to XLOG_XACT_INVALIDATIONS. 
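As a rough illustration of the relhasindex case above (not part of the patch), an output plugin reading "cat" while decoding the INSERT might do something like the sketch below. The function name, the hard-coded lookup in schema "public", and running inside the plugin's change callback with the historic snapshot already installed are all assumptions made for the example:

/*
 * Minimal sketch, assuming it runs in a logical decoding output plugin's
 * change callback, where catalog reads use the historic snapshot.
 */
#include "postgres.h"

#include "access/table.h"
#include "catalog/pg_namespace_d.h"
#include "storage/lockdefs.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

static bool
cat_has_index_during_decoding(void)
{
	/* simplified: look up "cat" in the public schema only */
	Oid			relid = get_relname_relid("cat", PG_PUBLIC_NAMESPACE);
	Relation	rel;
	bool		hasindex;

	if (!OidIsValid(relid))
		return false;

	rel = table_open(relid, AccessShareLock);
	/* relhasindex is the inplace-updated column discussed above */
	hasindex = rel->rd_rel->relhasindex;
	table_close(rel, AccessShareLock);

	return hasindex;
}

Whether such a read sees relhasindex=t before or after the CREATE INDEX depends on the invalidation timing described above; by the time the new pg_index row is visible to the historic snapshot, relhasindex=t is visible too.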
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 225f9829f22..6daf4a87dec 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6116,7 +6116,7 @@ heap_finish_speculative(Relation relation, const ItemPointerData *tid) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); @@ -6124,10 +6124,10 @@ heap_finish_speculative(Relation relation, const ItemPointerData *tid) page = BufferGetPage(buffer); offnum = ItemPointerGetOffsetNumber(tid); - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(ERROR, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(ERROR, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -6349,10 +6349,13 @@ heap_abort_speculative(Relation relation, const ItemPointerData *tid) * Since this is intended for system catalogs and SERIALIZABLE doesn't cover * DDL, this doesn't guarantee any particular predicate locking. * - * One could modify this to return true for tuples with delete in progress, - * All inplace updaters take a lock that conflicts with DROP. If explicit - * "DELETE FROM pg_class" is in progress, we'll wait for it like we would an - * update. + * heap_delete() is a rarer source of blocking transactions (xwait). We'll + * wait for such a transaction just like for the normal heap_update() case. + * Normal concurrent DROP commands won't cause that, because all inplace + * updaters take some lock that conflicts with DROP. An explicit SQL "DELETE + * FROM pg_class" can cause it. By waiting, if the concurrent transaction + * executed both "DELETE FROM pg_class" and "INSERT INTO pg_class", our caller + * can find the successor tuple. * * Readers of inplace-updated fields expect changes to those fields are * durable. For example, vac_truncate_clog() reads datfrozenxid from @@ -6393,15 +6396,17 @@ heap_inplace_lock(Relation relation, Assert(BufferIsValid(buffer)); /* - * Construct shared cache inval if necessary. Because we pass a tuple - * version without our own inplace changes or inplace changes other - * sessions complete while we wait for locks, inplace update mustn't - * change catcache lookup keys. But we aren't bothering with index - * updates either, so that's true a fortiori. After LockBuffer(), it - * would be too late, because this might reach a - * CatalogCacheInitializeCache() that locks "buffer". + * Register shared cache invals if necessary. Other sessions may finish + * inplace updates of this tuple between this step and LockTuple(). Since + * inplace updates don't change cache keys, that's harmless. + * + * While it's tempting to register invals only after confirming we can + * return true, the following obstacle precludes reordering steps that + * way. Registering invals might reach a CatalogCacheInitializeCache() + * that locks "buffer". That would hang indefinitely if running after our + * own LockBuffer(). Hence, we must register invals before LockBuffer(). 
*/ - CacheInvalidateHeapTupleInplace(relation, oldtup_ptr, NULL); + CacheInvalidateHeapTupleInplace(relation, oldtup_ptr); LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); @@ -6639,10 +6644,6 @@ heap_inplace_update_and_unlock(Relation relation, /* * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we * do this before UnlockTuple(). - * - * If we're mutating a tuple visible only to this transaction, there's an - * equivalent transactional inval from the action that created the tuple, - * and this inval is superfluous. */ AtInplace_Inval(); @@ -6653,10 +6654,10 @@ heap_inplace_update_and_unlock(Relation relation, AcceptInvalidationMessages(); /* local processing of just-sent inval */ /* - * Queue a transactional inval. The immediate invalidation we just sent - * is the only one known to be necessary. To reduce risk from the - * transition to immediate invalidation, continue sending a transactional - * invalidation like we've long done. Third-party code might rely on it. + * Queue a transactional inval, for logical decoding and for third-party + * code that might have been relying on it since long before inplace + * update adopted immediate invalidation. See README.tuplock section + * "Reading inplace-updated columns" for logical decoding details. */ if (!IsBootstrapProcessingMode()) CacheInvalidateHeapTuple(relation, tuple, NULL); diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index a09fb4b803a..1823feff298 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -422,7 +422,7 @@ heap_xlog_delete(XLogReaderState *record) xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record); Buffer buffer; Page page; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; BlockNumber blkno; RelFileLocator target_locator; @@ -451,10 +451,10 @@ heap_xlog_delete(XLogReaderState *record) { page = BufferGetPage(buffer); - if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) - lp = PageGetItemId(page, xlrec->offnum); - - if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) + if (xlrec->offnum < 1 || xlrec->offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, xlrec->offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -817,7 +817,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) nbuffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleData oldtup; HeapTupleHeader htup; uint16 prefixlen = 0, @@ -881,10 +881,10 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) { page = BufferGetPage(obuffer); offnum = xlrec->old_offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -1087,7 +1087,7 @@ heap_xlog_confirm(XLogReaderState *record) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) @@ -1095,10 +1095,10 @@ heap_xlog_confirm(XLogReaderState *record) page = BufferGetPage(buffer); offnum = xlrec->offnum; - if 
(PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -1126,7 +1126,7 @@ heap_xlog_lock(XLogReaderState *record) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; /* @@ -1155,10 +1155,10 @@ heap_xlog_lock(XLogReaderState *record) page = BufferGetPage(buffer); offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -1200,7 +1200,7 @@ heap_xlog_lock_updated(XLogReaderState *record) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; xlrec = (xl_heap_lock_updated *) XLogRecGetData(record); @@ -1231,10 +1231,10 @@ heap_xlog_lock_updated(XLogReaderState *record) page = BufferGetPage(buffer); offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -1263,7 +1263,7 @@ heap_xlog_inplace(XLogReaderState *record) Buffer buffer; Page page; OffsetNumber offnum; - ItemId lp = NULL; + ItemId lp; HeapTupleHeader htup; uint32 oldlen; Size newlen; @@ -1275,10 +1275,10 @@ heap_xlog_inplace(XLogReaderState *record) page = BufferGetPage(buffer); offnum = xlrec->offnum; - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) elog(PANIC, "invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 707c25289cd..b7f10a1aed0 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -781,10 +781,11 @@ systable_endscan_ordered(SysScanDesc sysscan) * systable_inplace_update_begin --- update a row "in place" (overwrite it) * * Overwriting violates both MVCC and transactional safety, so the uses of - * this function in Postgres are extremely limited. Nonetheless we find some - * places to use it. See README.tuplock section "Locking to write - * inplace-updated tables" and later sections for expectations of readers and - * writers of a table that gets inplace updates. Standard flow: + * this function in Postgres are extremely limited. This makes no effort to + * support updating cache key columns or other indexed columns. Nonetheless + * we find some places to use it. 
See README.tuplock section "Locking to + * write inplace-updated tables" and later sections for expectations of + * readers and writers of a table that gets inplace updates. Standard flow: * * ... [any slow preparation not requiring oldtup] ... * systable_inplace_update_begin([...], &tup, &inplace_state); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index aec71093661..7a416d60cea 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -2148,6 +2148,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) else offnum = P_FIRSTDATAKEY(opaque); + if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page)) + elog(PANIC, "offnum out of range"); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); blkno = BTreeTupleGetDownLink(itup); diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index f4fab7edfee..34956a5a663 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -1262,6 +1262,11 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("MultiXact %u has invalid next offset", multi))); + if (nextMXOffset == offset) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u with offset (%" PRIu64 ") has zero members", + multi, offset))); if (nextMXOffset < offset) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6a5640df51a..430a38b1a21 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7001,6 +7001,10 @@ CreateCheckPoint(int flags) */ SyncPreCheckpoint(); + /* Run these points outside the critical section. */ + INJECTION_POINT("create-checkpoint-initial", NULL); + INJECTION_POINT_LOAD("create-checkpoint-run"); + /* * Use a critical section to force system panic if we have trouble. */ @@ -7151,6 +7155,8 @@ CreateCheckPoint(int flags) if (log_checkpoints) LogCheckpointStart(flags, false); + INJECTION_POINT_CACHED("create-checkpoint-run", NULL); + /* Update the process title */ update_checkpoint_display(flags, false, false); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index ae2398d6975..38b594d2170 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -805,6 +805,16 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, } memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + + /* Make sure that REDO location exists. 
*/ + if (checkPoint.redo < CheckPointLoc) + { + XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo); + if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID)) + ereport(FATAL, + errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc))); + } } if (ArchiveRecoveryRequested) diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index cc03f0706e9..5e15cb1825e 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -521,18 +521,9 @@ heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) /* * Inplace updates are only ever performed on catalog tuples and - * can, per definition, not change tuple visibility. Inplace - * updates don't affect storage or interpretation of table rows, - * so they don't affect logicalrep_write_tuple() outcomes. Hence, - * we don't process invalidations from the original operation. If - * inplace updates did affect those things, invalidations wouldn't - * make it work, since there are no snapshot-specific versions of - * inplace-updated values. Since we also don't decode catalog - * tuples, we're not interested in the record's contents. - * - * WAL contains likely-unnecessary commit-time invals from the - * CacheInvalidateHeapTuple() call in - * heap_inplace_update_and_unlock(). Excess invalidation is safe. + * can, per definition, not change tuple visibility. Since we + * also don't decode catalog tuples, we're not interested in the + * record's contents. */ break; diff --git a/src/backend/storage/ipc/dsm_registry.c b/src/backend/storage/ipc/dsm_registry.c index 66240318e83..072f9399969 100644 --- a/src/backend/storage/ipc/dsm_registry.c +++ b/src/backend/storage/ipc/dsm_registry.c @@ -180,11 +180,13 @@ init_dsm_registry(void) * Initialize or attach a named DSM segment. * * This routine returns the address of the segment. init_callback is called to - * initialize the segment when it is first created. + * initialize the segment when it is first created. 'arg' is passed through to + * the initialization callback function. 
*/ void * GetNamedDSMSegment(const char *name, size_t size, - void (*init_callback) (void *ptr), bool *found) + void (*init_callback) (void *ptr, void *arg), + bool *found, void *arg) { DSMRegistryEntry *entry; MemoryContext oldcontext; @@ -235,7 +237,7 @@ GetNamedDSMSegment(const char *name, size_t size, seg = dsm_create(size, 0); if (init_callback) - (*init_callback) (dsm_segment_address(seg)); + (*init_callback) (dsm_segment_address(seg), arg); dsm_pin_segment(seg); dsm_pin_mapping(seg); diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index 6e7b914c563..f8524548e46 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -15,18 +15,19 @@ #include "postgres.h" #include "access/detoast.h" -#include "catalog/pg_collation_d.h" -#include "catalog/pg_type_d.h" +#include "common/hashfn.h" #include "common/int.h" #include "fmgr.h" +#include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "port/pg_bitutils.h" +#include "port/pg_bswap.h" #include "utils/builtins.h" #include "utils/bytea.h" #include "utils/fmgrprotos.h" +#include "utils/guc.h" #include "utils/memutils.h" #include "utils/sortsupport.h" -#include "utils/varlena.h" #include "varatt.h" /* GUC variable */ @@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L, bool length_not_specified); static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); +typedef struct +{ + bool abbreviate; /* Should we abbreviate keys? */ + hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ + hyperLogLogState full_card; /* Full key cardinality state */ + double prop_card; /* Required cardinality proportion */ +} ByteaSortSupport; + +/* Static function declarations for sort support */ +static int byteafastcmp(Datum x, Datum y, SortSupport ssup); +static Datum bytea_abbrev_convert(Datum original, SortSupport ssup); +static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup); + /* * bytea_catenate * Guts of byteacat(), broken out so it can be used by other functions @@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(result); } +/* + * sortsupport comparison func + */ +static int +byteafastcmp(Datum x, Datum y, SortSupport ssup) +{ + bytea *arg1 = DatumGetByteaPP(x); + bytea *arg2 = DatumGetByteaPP(y); + char *a1p, + *a2p; + int len1, + len2, + result; + + a1p = VARDATA_ANY(arg1); + a2p = VARDATA_ANY(arg2); + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + result = memcmp(a1p, a2p, Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? -1 : 1; + + /* We can't afford to leak memory here. */ + if (PointerGetDatum(arg1) != x) + pfree(arg1); + if (PointerGetDatum(arg2) != y) + pfree(arg2); + + return result; +} + +/* + * Conversion routine for sortsupport. Converts original to abbreviated key + * representation. Our encoding strategy is simple -- pack the first 8 bytes + * of the bytea data into a Datum (on little-endian machines, the bytes are + * stored in reverse order), and treat it as an unsigned integer. 
+ */ +static Datum +bytea_abbrev_convert(Datum original, SortSupport ssup) +{ + const size_t max_prefix_bytes = sizeof(Datum); + ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra; + bytea *authoritative = DatumGetByteaPP(original); + char *authoritative_data = VARDATA_ANY(authoritative); + Datum res; + char *pres; + int len; + uint32 hash; + + pres = (char *) &res; + + /* memset(), so any non-overwritten bytes are NUL */ + memset(pres, 0, max_prefix_bytes); + len = VARSIZE_ANY_EXHDR(authoritative); + + /* + * Short byteas will have terminating NUL bytes in the abbreviated datum. + * Abbreviated comparison need not make a distinction between these NUL + * bytes, and NUL bytes representing actual NULs in the authoritative + * representation. + * + * Hopefully a comparison at or past one abbreviated key's terminating NUL + * byte will resolve the comparison without consulting the authoritative + * representation; specifically, some later non-NUL byte in the longer + * bytea can resolve the comparison against a subsequent terminating NUL + * in the shorter bytea. There will usually be what is effectively a + * "length-wise" resolution there and then. + * + * If that doesn't work out -- if all bytes in the longer bytea positioned + * at or past the offset of the smaller bytea (first) terminating NUL are + * actually representative of NUL bytes in the authoritative binary bytea + * (perhaps with some *terminating* NUL bytes towards the end of the + * longer bytea iff it happens to still be small) -- then an authoritative + * tie-breaker will happen, and do the right thing: explicitly consider + * bytea length. + */ + memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); + + /* + * Maintain approximate cardinality of both abbreviated keys and original, + * authoritative keys using HyperLogLog. Used as cheap insurance against + * the worst case, where we do many string abbreviations for no saving in + * full memcmp()-based comparisons. These statistics are used by + * bytea_abbrev_abort(). + * + * First, Hash key proper, or a significant fraction of it. Mix in length + * in order to compensate for cases where differences are past + * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. + */ + hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, + Min(len, PG_CACHE_LINE_SIZE))); + + if (len > PG_CACHE_LINE_SIZE) + hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + + addHyperLogLog(&bss->full_card, hash); + + /* Hash abbreviated key */ + { + uint32 tmp; + + tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32); + hash = DatumGetUInt32(hash_uint32(tmp)); + } + + addHyperLogLog(&bss->abbr_card, hash); + + /* + * Byteswap on little-endian machines. + * + * This is needed so that ssup_datum_unsigned_cmp() works correctly on all + * platforms. + */ + res = DatumBigEndianToNative(res); + + /* Don't leak memory here */ + if (PointerGetDatum(authoritative) != original) + pfree(authoritative); + + return res; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization, using + * heuristic rules. Returns value indicating if the abbreviation optimization + * should be aborted, based on its projected effectiveness. + * + * This is based on varstr_abbrev_abort(), but some comments have been elided + * for brevity. See there for more details. 
+ */ +static bool +bytea_abbrev_abort(int memtupcount, SortSupport ssup) +{ + ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra; + double abbrev_distinct, + key_distinct; + + Assert(ssup->abbreviate); + + /* Have a little patience */ + if (memtupcount < 100) + return false; + + abbrev_distinct = estimateHyperLogLog(&bss->abbr_card); + key_distinct = estimateHyperLogLog(&bss->full_card); + + /* + * Clamp cardinality estimates to at least one distinct value. While + * NULLs are generally disregarded, if only NULL values were seen so far, + * that might misrepresent costs if we failed to clamp. + */ + if (abbrev_distinct < 1.0) + abbrev_distinct = 1.0; + + if (key_distinct < 1.0) + key_distinct = 1.0; + + if (trace_sort) + { + double norm_abbrev_card = abbrev_distinct / (double) memtupcount; + + elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f " + "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card, + bss->prop_card); + } + + /* + * If the number of distinct abbreviated keys approximately matches the + * number of distinct original keys, continue with abbreviation. + */ + if (abbrev_distinct > key_distinct * bss->prop_card) + { + /* + * Decay required cardinality aggressively after 10,000 tuples. + */ + if (memtupcount > 10000) + bss->prop_card *= 0.65; + + return false; + } + + /* + * Abort abbreviation strategy. + */ + if (trace_sort) + elog(LOG, "bytea_abbrev: aborted abbreviation at %d " + "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, bss->prop_card); + + return true; +} + Datum bytea_sortsupport(PG_FUNCTION_ARGS) { @@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - /* Use generic string SortSupport, forcing "C" collation */ - varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); + ssup->comparator = byteafastcmp; + + /* + * Set up abbreviation support if requested. 
+ */ + if (ssup->abbreviate) + { + ByteaSortSupport *bss; + + bss = palloc_object(ByteaSortSupport); + bss->abbreviate = true; + bss->prop_card = 0.20; + initHyperLogLog(&bss->abbr_card, 10); + initHyperLogLog(&bss->full_card, 10); + + ssup->ssup_extra = bss; + ssup->abbrev_full_comparator = ssup->comparator; + ssup->comparator = ssup_datum_unsigned_cmp; + ssup->abbrev_converter = bytea_abbrev_convert; + ssup->abbrev_abort = bytea_abbrev_abort; + } MemoryContextSwitchTo(oldcontext); diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index dca1d9be035..b1b0192aa46 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -99,8 +99,6 @@ static Selectivity like_selectivity(const char *patt, int pattlen, static Selectivity regex_selectivity(const char *patt, int pattlen, bool case_insensitive, int fixed_prefix_len); -static int pattern_char_isalpha(char c, bool is_multibyte, - pg_locale_t locale); static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation); static Datum string_to_datum(const char *str, Oid datatype); @@ -986,8 +984,8 @@ icnlikejoinsel(PG_FUNCTION_ARGS) */ static Pattern_Prefix_Status -like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, - Const **prefix_const, Selectivity *rest_selec) +like_fixed_prefix(Const *patt_const, Const **prefix_const, + Selectivity *rest_selec) { char *match; char *patt; @@ -995,34 +993,10 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Oid typeid = patt_const->consttype; int pos, match_pos; - bool is_multibyte = (pg_database_encoding_max_length() > 1); - pg_locale_t locale = 0; /* the right-hand const is type text or bytea */ Assert(typeid == BYTEAOID || typeid == TEXTOID); - if (case_insensitive) - { - if (typeid == BYTEAOID) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("case insensitive matching not supported on type bytea"))); - - if (!OidIsValid(collation)) - { - /* - * This typically means that the parser could not resolve a - * conflict of implicit collations, so report it that way. - */ - ereport(ERROR, - (errcode(ERRCODE_INDETERMINATE_COLLATION), - errmsg("could not determine which collation to use for ILIKE"), - errhint("Use the COLLATE clause to set the collation explicitly."))); - } - - locale = pg_newlocale_from_collation(collation); - } - if (typeid != BYTEAOID) { patt = TextDatumGetCString(patt_const->constvalue); @@ -1055,11 +1029,6 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, break; } - /* Stop if case-varying character (it's sort of a wildcard) */ - if (case_insensitive && - pattern_char_isalpha(patt[pos], is_multibyte, locale)) - break; - match[match_pos++] = patt[pos]; } @@ -1071,8 +1040,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, *prefix_const = string_to_bytea_const(match, match_pos); if (rest_selec != NULL) - *rest_selec = like_selectivity(&patt[pos], pattlen - pos, - case_insensitive); + *rest_selec = like_selectivity(&patt[pos], pattlen - pos, false); pfree(patt); pfree(match); @@ -1087,6 +1055,112 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, return Pattern_Prefix_None; } +/* + * Case-insensitive variant of like_fixed_prefix(). Multibyte and + * locale-aware for detecting cased characters. 
+ */ +static Pattern_Prefix_Status +like_fixed_prefix_ci(Const *patt_const, Oid collation, Const **prefix_const, + Selectivity *rest_selec) +{ + text *val = DatumGetTextPP(patt_const->constvalue); + Oid typeid = patt_const->consttype; + int nbytes = VARSIZE_ANY_EXHDR(val); + int wpos; + pg_wchar *wpatt; + int wpattlen; + pg_wchar *wmatch; + int wmatch_pos = 0; + char *match; + int match_mblen; + pg_locale_t locale = 0; + + /* the right-hand const is type text or bytea */ + Assert(typeid == BYTEAOID || typeid == TEXTOID); + + if (typeid == BYTEAOID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("case insensitive matching not supported on type bytea"))); + + if (!OidIsValid(collation)) + { + /* + * This typically means that the parser could not resolve a conflict + * of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for ILIKE"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + + locale = pg_newlocale_from_collation(collation); + + wpatt = palloc((nbytes + 1) * sizeof(pg_wchar)); + wpattlen = pg_mb2wchar_with_len(VARDATA_ANY(val), wpatt, nbytes); + + wmatch = palloc((nbytes + 1) * sizeof(pg_wchar)); + for (wpos = 0; wpos < wpattlen; wpos++) + { + /* % and _ are wildcard characters in LIKE */ + if (wpatt[wpos] == '%' || + wpatt[wpos] == '_') + break; + + /* Backslash escapes the next character */ + if (wpatt[wpos] == '\\') + { + wpos++; + if (wpos >= wpattlen) + break; + } + + /* + * For ILIKE, stop if it's a case-varying character (it's sort of a + * wildcard). + */ + if (pg_iswcased(wpatt[wpos], locale)) + break; + + wmatch[wmatch_pos++] = wpatt[wpos]; + } + + wmatch[wmatch_pos] = '\0'; + + match = palloc(pg_database_encoding_max_length() * wmatch_pos + 1); + match_mblen = pg_wchar2mb_with_len(wmatch, match, wmatch_pos); + match[match_mblen] = '\0'; + pfree(wmatch); + + *prefix_const = string_to_const(match, TEXTOID); + pfree(match); + + if (rest_selec != NULL) + { + int wrestlen = wpattlen - wmatch_pos; + char *rest; + int rest_mblen; + + rest = palloc(pg_database_encoding_max_length() * wrestlen + 1); + rest_mblen = pg_wchar2mb_with_len(&wpatt[wmatch_pos], rest, wrestlen); + + *rest_selec = like_selectivity(rest, rest_mblen, true); + pfree(rest); + } + + pfree(wpatt); + + /* in LIKE, an empty pattern is an exact match! 
*/ + if (wpos == wpattlen) + return Pattern_Prefix_Exact; /* reached end of pattern, so exact */ + + if (wmatch_pos > 0) + return Pattern_Prefix_Partial; + + return Pattern_Prefix_None; +} + static Pattern_Prefix_Status regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, Const **prefix_const, Selectivity *rest_selec) @@ -1164,12 +1238,11 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, switch (ptype) { case Pattern_Type_Like: - result = like_fixed_prefix(patt, false, collation, - prefix, rest_selec); + result = like_fixed_prefix(patt, prefix, rest_selec); break; case Pattern_Type_Like_IC: - result = like_fixed_prefix(patt, true, collation, - prefix, rest_selec); + result = like_fixed_prefix_ci(patt, collation, prefix, + rest_selec); break; case Pattern_Type_Regex: result = regex_fixed_prefix(patt, false, collation, @@ -1481,24 +1554,6 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive, return sel; } -/* - * Check whether char is a letter (and, hence, subject to case-folding) - * - * In multibyte character sets or with ICU, we can't use isalpha, and it does - * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha. - * Instead, just assume any non-ASCII char is potentially case-varying, and - * hard-wire knowledge of which ASCII chars are letters. - */ -static int -pattern_char_isalpha(char c, bool is_multibyte, - pg_locale_t locale) -{ - if (locale->ctype_is_c) - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - else - return char_is_cased(c, locale); -} - /* * For bytea, the increment function need only increment the current byte diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 70933ee3843..8a3796aa5d0 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1626,21 +1626,6 @@ pg_towlower(pg_wchar wc, pg_locale_t locale) } /* - * char_is_cased() - * - * Fuzzy test of whether the given char is case-varying or not. The argument - * is a single byte, so in a multibyte encoding, just assume any non-ASCII - * char is case-varying. - */ -bool -char_is_cased(char ch, pg_locale_t locale) -{ - if (locale->ctype == NULL) - return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); - return locale->ctype->char_is_cased(ch, locale); -} - -/* * Return required encoding ID for the given locale, or -1 if any encoding is * valid for the locale. 
*/ diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 0d4c754a267..0c2920112bb 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -191,13 +191,6 @@ wc_iscased_builtin(pg_wchar wc, pg_locale_t locale) return pg_u_prop_cased(to_char32(wc)); } -static bool -char_is_cased_builtin(char ch, pg_locale_t locale) -{ - return IS_HIGHBIT_SET(ch) || - (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); -} - static pg_wchar wc_toupper_builtin(pg_wchar wc, pg_locale_t locale) { @@ -225,7 +218,6 @@ static const struct ctype_methods ctype_methods_builtin = { .wc_ispunct = wc_ispunct_builtin, .wc_isspace = wc_isspace_builtin, .wc_isxdigit = wc_isxdigit_builtin, - .char_is_cased = char_is_cased_builtin, .wc_iscased = wc_iscased_builtin, .wc_tolower = wc_tolower_builtin, .wc_toupper = wc_toupper_builtin, diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index e8820666b2d..18d026deda8 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -121,13 +121,6 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); -static bool -char_is_cased_icu(char ch, pg_locale_t locale) -{ - return IS_HIGHBIT_SET(ch) || - (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); -} - /* * XXX: many of the functions below rely on casts directly from pg_wchar to * UChar32, which is correct for the UTF-8 encoding, but not in general. @@ -244,7 +237,6 @@ static const struct ctype_methods ctype_methods_icu = { .wc_ispunct = wc_ispunct_icu, .wc_isspace = wc_isspace_icu, .wc_isxdigit = wc_isxdigit_icu, - .char_is_cased = char_is_cased_icu, .wc_iscased = wc_iscased_icu, .wc_toupper = toupper_icu, .wc_tolower = tolower_icu, diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 3d841f818a5..3baa5816b5f 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -262,17 +262,6 @@ wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale) iswlower_l((wint_t) wc, locale->lt); } -static bool -char_is_cased_libc(char ch, pg_locale_t locale) -{ - bool is_multibyte = pg_database_encoding_max_length() > 1; - - if (is_multibyte && IS_HIGHBIT_SET(ch)) - return true; - else - return isalpha_l((unsigned char) ch, locale->lt); -} - static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale) { @@ -345,7 +334,6 @@ static const struct ctype_methods ctype_methods_libc_sb = { .wc_ispunct = wc_ispunct_libc_sb, .wc_isspace = wc_isspace_libc_sb, .wc_isxdigit = wc_isxdigit_libc_sb, - .char_is_cased = char_is_cased_libc, .wc_iscased = wc_iscased_libc_sb, .wc_toupper = toupper_libc_sb, .wc_tolower = tolower_libc_sb, @@ -371,7 +359,6 @@ static const struct ctype_methods ctype_methods_libc_other_mb = { .wc_ispunct = wc_ispunct_libc_sb, .wc_isspace = wc_isspace_libc_sb, .wc_isxdigit = wc_isxdigit_libc_sb, - .char_is_cased = char_is_cased_libc, .wc_iscased = wc_iscased_libc_sb, .wc_toupper = toupper_libc_sb, .wc_tolower = tolower_libc_sb, @@ -393,7 +380,6 @@ static const struct ctype_methods ctype_methods_libc_utf8 = { .wc_ispunct = wc_ispunct_libc_mb, .wc_isspace = wc_isspace_libc_mb, .wc_isxdigit = wc_isxdigit_libc_mb, - .char_is_cased = char_is_cased_libc, .wc_iscased = wc_iscased_libc_mb, .wc_toupper = toupper_libc_mb, .wc_tolower = tolower_libc_mb, diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c 
index baa5b44ea8d..8adeb8dadc6 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -92,7 +92,7 @@ typedef struct int last_returned; /* Last comparison result (cache) */ bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */ bool collate_c; - Oid typid; /* Actual datatype (text/bpchar/bytea/name) */ + Oid typid; /* Actual datatype (text/bpchar/name) */ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ hyperLogLogState full_card; /* Full key cardinality state */ double prop_card; /* Required cardinality proportion */ @@ -1617,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS) * Includes locale support, and support for BpChar semantics (i.e. removing * trailing spaces before comparison). * - * Relies on the assumption that text, VarChar, BpChar, and bytea all have the - * same representation. Callers that always use the C collation (e.g. - * non-collatable type callers like bytea) may have NUL bytes in their strings; - * this will not work with any other collation, though. + * Relies on the assumption that text, VarChar, and BpChar all have the + * same representation. */ void varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) @@ -1983,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) * representation. Our encoding strategy is simple -- pack the first 8 bytes * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are * stored in reverse order), and treat it as an unsigned integer. When the "C" - * locale is used, or in case of bytea, just memcpy() from original instead. + * locale is used just memcpy() from original instead. */ static Datum varstr_abbrev_convert(Datum original, SortSupport ssup) @@ -2010,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) /* * If we're using the C collation, use memcpy(), rather than strxfrm(), to - * abbreviate keys. The full comparator for the C locale is always - * memcmp(). It would be incorrect to allow bytea callers (callers that - * always force the C collation -- bytea isn't a collatable type, but this - * approach is convenient) to use strxfrm(). This is because bytea - * strings may contain NUL bytes. Besides, this should be faster, too. - * - * More generally, it's okay that bytea callers can have NUL bytes in - * strings because abbreviated cmp need not make a distinction between - * terminating NUL bytes, and NUL bytes representing actual NULs in the - * authoritative representation. Hopefully a comparison at or past one - * abbreviated key's terminating NUL byte will resolve the comparison - * without consulting the authoritative representation; specifically, some - * later non-NUL byte in the longer string can resolve the comparison - * against a subsequent terminating NUL in the shorter string. There will - * usually be what is effectively a "length-wise" resolution there and - * then. - * - * If that doesn't work out -- if all bytes in the longer string - * positioned at or past the offset of the smaller string's (first) - * terminating NUL are actually representative of NUL bytes in the - * authoritative binary string (perhaps with some *terminating* NUL bytes - * towards the end of the longer string iff it happens to still be small) - * -- then an authoritative tie-breaker will happen, and do the right - * thing: explicitly consider string length. + * abbreviate keys. The full comparator for the C locale is also + * memcmp(). This should be faster than strxfrm(). 
*/ if (sss->collate_c) memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); @@ -2115,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * strxfrm() blob is itself NUL terminated, leaving no danger of * misinterpreting any NUL bytes not intended to be interpreted as * logically representing termination. - * - * (Actually, even if there were NUL bytes in the blob it would be - * okay. See remarks on bytea case above.) */ memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize)); } @@ -2198,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup) * NULLs are generally disregarded, if only NULL values were seen so far, * that might misrepresent costs if we failed to clamp. */ - if (abbrev_distinct <= 1.0) + if (abbrev_distinct < 1.0) abbrev_distinct = 1.0; - if (key_distinct <= 1.0) + if (key_distinct < 1.0) key_distinct = 1.0; /* diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 9d16ca10ae1..8f7a56d0f2c 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -1583,13 +1583,17 @@ CacheInvalidateHeapTuple(Relation relation, * implied. * * Like CacheInvalidateHeapTuple(), but for inplace updates. + * + * Just before and just after the inplace update, the tuple's cache keys must + * match those in key_equivalent_tuple. Cache keys consist of catcache lookup + * key columns and columns referencing pg_class.oid values, + * e.g. pg_constraint.conrelid, which would trigger relcache inval. */ void CacheInvalidateHeapTupleInplace(Relation relation, - HeapTuple tuple, - HeapTuple newtuple) + HeapTuple key_equivalent_tuple) { - CacheInvalidateHeapTupleCommon(relation, tuple, newtuple, + CacheInvalidateHeapTupleCommon(relation, key_equivalent_tuple, NULL, PrepareInplaceInvalidationState); } diff --git a/src/bin/pg_upgrade/multixact_read_v18.c b/src/bin/pg_upgrade/multixact_read_v18.c index f7ed97f702b..dc4ed1b3ae6 100644 --- a/src/bin/pg_upgrade/multixact_read_v18.c +++ b/src/bin/pg_upgrade/multixact_read_v18.c @@ -146,11 +146,14 @@ AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti, * - Because there's no concurrent activity, we don't need to worry about * locking and some corner cases. * - * - Don't bail out on invalid entries. If the server crashes, it can leave - * invalid or half-written entries on disk. Such multixids won't appear - * anywhere else on disk, so the server will never try to read them. During - * upgrade, however, we scan through all multixids in order, and will - * encounter such invalid but unreferenced multixids too. + * - Don't bail out on invalid entries that could've been left behind after a + * server crash. Such multixids won't appear anywhere else on disk, so the + * server will never try to read them. During upgrade, however, we scan + * through all multixids in order, and will encounter such invalid but + * unreferenced multixids too. We try to distinguish between entries that + * are invalid because of missed disk writes, like entries with zeros in + * offsets or members, and entries that look corrupt in other ways that + * should not happen even on a server crash. * * Returns true on success, false if the multixact was invalid. */ @@ -211,7 +214,7 @@ GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi, if (offset == 0) { - /* Invalid entry */ + /* Invalid entry. These can be left behind on a server crash. 
*/ return false; } @@ -247,11 +250,29 @@ GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi, if (nextMXOffset == 0) { - /* Invalid entry */ + /* Invalid entry. These can be left behind on a server crash. */ return false; } length = nextMXOffset - offset; + if (length < 0) + { + /* + * This entry is corrupt. We should not see these even after a server + * crash. + */ + pg_fatal("multixact %u has an invalid length (%d)", multi, length); + } + if (length == 0) + { + /* + * Invalid entry. The server never writes multixids with zero + * members, but it's not clear if a server crash or using pg_resetwal + * could leave them behind. Seems best to accept them. + */ + return false; + } + /* read the members */ prev_pageno = -1; for (int i = 0; i < length; i++, offset++) @@ -284,10 +305,11 @@ GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi, /* * Otherwise this is an invalid entry that should not be - * referenced from anywhere in the heap. We could return 'false' - * here, but we prefer to continue reading the members and - * converting them the best we can, to preserve evidence in case - * this is corruption that should not happen. + * referenced from anywhere in the heap. These can be left behind + * on a server crash. We could return 'false' here, but we prefer + * to continue reading the members and converting them the best we + * can, to preserve evidence in case this is corruption that + * should not have happened. */ } diff --git a/src/bin/pg_upgrade/t/007_multixact_conversion.pl b/src/bin/pg_upgrade/t/007_multixact_conversion.pl index 443b93c7545..14db40e45ed 100644 --- a/src/bin/pg_upgrade/t/007_multixact_conversion.pl +++ b/src/bin/pg_upgrade/t/007_multixact_conversion.pl @@ -26,8 +26,9 @@ my $tempdir = PostgreSQL::Test::Utils::tempdir; # upgrading them. The workload is a mix of KEY SHARE locking queries # and UPDATEs, and commits and aborts, to generate a mix of multixids # with different statuses. It consumes around 3000 multixids with -# 30000 members. That's enough to span more than one multixids -# 'offsets' page, and more than one 'members' segment. +# 60000 members in total. That's enough to span more than one +# multixids 'offsets' page, and more than one 'members' segment with +# the default block size. # # The workload leaves behind a table called 'mxofftest' containing a # small number of rows referencing some of the generated multixids. @@ -68,6 +69,12 @@ sub mxact_workload # verbose by setting this. my $verbose = 0; + # Bump the timeout on the connections to avoid false negatives on + # slow test systems. The timeout covers the whole duration that + # the connections are open rather than the individual queries. + my $connection_timeout_secs = + 4 * $PostgreSQL::Test::Utils::timeout_default; + # Open multiple connections to the database. Start a transaction # in each connection. for (0 .. $nclients) @@ -75,8 +82,10 @@ sub mxact_workload # Use the psql binary from the new installation. The # BackgroundPsql functionality doesn't work with older psql # versions. - my $conn = $binnode->background_psql('', - connstr => $node->connstr('postgres')); + my $conn = $binnode->background_psql( + '', + connstr => $node->connstr('postgres'), + timeout => $connection_timeout_secs); $conn->query_safe("SET log_statement=none", verbose => $verbose) unless $verbose; @@ -88,12 +97,14 @@ sub mxact_workload # Run queries using cycling through the connections in a # round-robin fashion. 
We keep a transaction open in each - # connection at all times, and lock/update the rows. With 10 + # connection at all times, and lock/update the rows. With 20 # connections, each SELECT FOR KEY SHARE query generates a new - # multixid, containing the 10 XIDs of all the transactions running - # at the time. + # multixid, containing the XIDs of all the transactions running at + # the time, ie. around 20 XIDs. for (my $i = 0; $i < 3000; $i++) { + note "generating multixids $i / 3000\n" if ($i % 100 == 0); + my $conn = $connections[ $i % $nclients ]; my $sql = ($i % $abort_every == 0) ? "ABORT" : "COMMIT"; diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 4d84bdc81e4..5df67ceea87 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -676,8 +676,6 @@ extern int pg_valid_server_encoding(const char *name); extern bool is_encoding_supported_by_icu(int encoding); extern const char *get_encoding_name_for_icu(int encoding); -extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string); -extern char32_t utf8_to_unicode(const unsigned char *c); extern bool pg_utf8_islegal(const unsigned char *source, int length); extern int pg_utf_mblen(const unsigned char *s); extern int pg_mule_mblen(const unsigned char *s); diff --git a/src/include/storage/dsm_registry.h b/src/include/storage/dsm_registry.h index 4871ed509eb..c7c6827afec 100644 --- a/src/include/storage/dsm_registry.h +++ b/src/include/storage/dsm_registry.h @@ -16,8 +16,8 @@ #include "lib/dshash.h" extern void *GetNamedDSMSegment(const char *name, size_t size, - void (*init_callback) (void *ptr), - bool *found); + void (*init_callback) (void *ptr, void *arg), + bool *found, void *arg); extern dsa_area *GetNamedDSA(const char *name, bool *found); extern dshash_table *GetNamedDSHash(const char *name, const dshash_parameters *params, diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index af466252578..345733dd038 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -61,8 +61,7 @@ extern void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple, HeapTuple newtuple); extern void CacheInvalidateHeapTupleInplace(Relation relation, - HeapTuple tuple, - HeapTuple newtuple); + HeapTuple key_equivalent_tuple); extern void CacheInvalidateCatalog(Oid catalogId); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 832007385d8..a533463d5b7 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -125,9 +125,6 @@ struct ctype_methods bool (*wc_iscased) (pg_wchar wc, pg_locale_t locale); pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale); pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale); - - /* required */ - bool (*char_is_cased) (char ch, pg_locale_t locale); }; /* @@ -178,7 +175,6 @@ extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); -extern bool char_is_cased(char ch, pg_locale_t locale); extern size_t pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); diff --git a/src/interfaces/libpq-oauth/oauth-curl.c b/src/interfaces/libpq-oauth/oauth-curl.c index bd0a656a166..92a7f1cd383 100644 --- a/src/interfaces/libpq-oauth/oauth-curl.c +++ b/src/interfaces/libpq-oauth/oauth-curl.c @@ -247,7 +247,8 @@ struct async_ctx * our entry point, errors have three parts: * * - errctx: an optional static string, describing the global operation - * currently in progress. 
It'll be translated for you. + * currently in progress. Should be translated with + * libpq_gettext(). * * - errbuf: contains the actual error message. Generally speaking, use * actx_error[_str] to manipulate this. This must be filled @@ -367,13 +368,16 @@ pg_fe_cleanup_oauth_flow(PGconn *conn) /* * Macros for manipulating actx->errbuf. actx_error() translates and formats a - * string for you; actx_error_str() appends a string directly without - * translation. + * string for you, actx_error_internal() is the untranslated equivalent, and + * actx_error_str() appends a string directly (also without translation). */ #define actx_error(ACTX, FMT, ...) \ appendPQExpBuffer(&(ACTX)->errbuf, libpq_gettext(FMT), ##__VA_ARGS__) +#define actx_error_internal(ACTX, FMT, ...) \ + appendPQExpBuffer(&(ACTX)->errbuf, FMT, ##__VA_ARGS__) + #define actx_error_str(ACTX, S) \ appendPQExpBufferStr(&(ACTX)->errbuf, S) @@ -461,6 +465,9 @@ struct oauth_parse #define oauth_parse_set_error(ctx, fmt, ...) \ appendPQExpBuffer((ctx)->errbuf, libpq_gettext(fmt), ##__VA_ARGS__) +#define oauth_parse_set_error_internal(ctx, fmt, ...) \ + appendPQExpBuffer((ctx)->errbuf, fmt, ##__VA_ARGS__) + static void report_type_mismatch(struct oauth_parse *ctx) { @@ -475,20 +482,20 @@ report_type_mismatch(struct oauth_parse *ctx) switch (ctx->active->type) { case JSON_TOKEN_STRING: - msgfmt = "field \"%s\" must be a string"; + msgfmt = gettext_noop("field \"%s\" must be a string"); break; case JSON_TOKEN_NUMBER: - msgfmt = "field \"%s\" must be a number"; + msgfmt = gettext_noop("field \"%s\" must be a number"); break; case JSON_TOKEN_ARRAY_START: - msgfmt = "field \"%s\" must be an array of strings"; + msgfmt = gettext_noop("field \"%s\" must be an array of strings"); break; default: Assert(false); - msgfmt = "field \"%s\" has unexpected type"; + msgfmt = gettext_noop("field \"%s\" has unexpected type"); } oauth_parse_set_error(ctx, msgfmt, ctx->active->name); @@ -536,9 +543,9 @@ oauth_json_object_field_start(void *state, char *name, bool isnull) if (ctx->active) { Assert(false); - oauth_parse_set_error(ctx, - "internal error: started field '%s' before field '%s' was finished", - name, ctx->active->name); + oauth_parse_set_error_internal(ctx, + "internal error: started field \"%s\" before field \"%s\" was finished", + name, ctx->active->name); return JSON_SEM_ACTION_FAILED; } @@ -588,9 +595,9 @@ oauth_json_object_end(void *state) if (!ctx->nested && ctx->active) { Assert(false); - oauth_parse_set_error(ctx, - "internal error: field '%s' still active at end of object", - ctx->active->name); + oauth_parse_set_error_internal(ctx, + "internal error: field \"%s\" still active at end of object", + ctx->active->name); return JSON_SEM_ACTION_FAILED; } @@ -644,9 +651,9 @@ oauth_json_array_end(void *state) if (ctx->nested != 2 || ctx->active->type != JSON_TOKEN_ARRAY_START) { Assert(false); - oauth_parse_set_error(ctx, - "internal error: found unexpected array end while parsing field '%s'", - ctx->active->name); + oauth_parse_set_error_internal(ctx, + "internal error: found unexpected array end while parsing field \"%s\"", + ctx->active->name); return JSON_SEM_ACTION_FAILED; } @@ -699,9 +706,9 @@ oauth_json_scalar(void *state, char *token, JsonTokenType type) if (ctx->nested != 1) { Assert(false); - oauth_parse_set_error(ctx, - "internal error: scalar target found at nesting level %d", - ctx->nested); + oauth_parse_set_error_internal(ctx, + "internal error: scalar target found at nesting level %d", + ctx->nested); return JSON_SEM_ACTION_FAILED; 
} @@ -709,9 +716,9 @@ oauth_json_scalar(void *state, char *token, JsonTokenType type) if (*field->scalar) { Assert(false); - oauth_parse_set_error(ctx, - "internal error: scalar field '%s' would be assigned twice", - ctx->active->name); + oauth_parse_set_error_internal(ctx, + "internal error: scalar field \"%s\" would be assigned twice", + ctx->active->name); return JSON_SEM_ACTION_FAILED; } @@ -731,9 +738,9 @@ oauth_json_scalar(void *state, char *token, JsonTokenType type) if (ctx->nested != 2) { Assert(false); - oauth_parse_set_error(ctx, - "internal error: array member found at nesting level %d", - ctx->nested); + oauth_parse_set_error_internal(ctx, + "internal error: array member found at nesting level %d", + ctx->nested); return JSON_SEM_ACTION_FAILED; } @@ -1087,7 +1094,7 @@ parse_token_error(struct async_ctx *actx, struct token_error *err) * override the errctx if parsing explicitly fails. */ if (!result) - actx->errctx = "failed to parse token error response"; + actx->errctx = libpq_gettext("failed to parse token error response"); return result; } @@ -1115,8 +1122,8 @@ record_token_error(struct async_ctx *actx, const struct token_error *err) if (response_code == 401) { actx_error(actx, actx->used_basic_auth - ? "provider rejected the oauth_client_secret" - : "provider requires client authentication, and no oauth_client_secret is set"); + ? gettext_noop("provider rejected the oauth_client_secret") + : gettext_noop("provider requires client authentication, and no oauth_client_secret is set")); actx_error_str(actx, " "); } } @@ -1179,20 +1186,20 @@ setup_multiplexer(struct async_ctx *actx) actx->mux = epoll_create1(EPOLL_CLOEXEC); if (actx->mux < 0) { - actx_error(actx, "failed to create epoll set: %m"); + actx_error_internal(actx, "failed to create epoll set: %m"); return false; } actx->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC); if (actx->timerfd < 0) { - actx_error(actx, "failed to create timerfd: %m"); + actx_error_internal(actx, "failed to create timerfd: %m"); return false; } if (epoll_ctl(actx->mux, EPOLL_CTL_ADD, actx->timerfd, &ev) < 0) { - actx_error(actx, "failed to add timerfd to epoll set: %m"); + actx_error_internal(actx, "failed to add timerfd to epoll set: %m"); return false; } @@ -1201,8 +1208,7 @@ setup_multiplexer(struct async_ctx *actx) actx->mux = kqueue(); if (actx->mux < 0) { - /*- translator: the term "kqueue" (kernel queue) should not be translated */ - actx_error(actx, "failed to create kqueue: %m"); + actx_error_internal(actx, "failed to create kqueue: %m"); return false; } @@ -1215,7 +1221,7 @@ setup_multiplexer(struct async_ctx *actx) actx->timerfd = kqueue(); if (actx->timerfd < 0) { - actx_error(actx, "failed to create timer kqueue: %m"); + actx_error_internal(actx, "failed to create timer kqueue: %m"); return false; } @@ -1259,7 +1265,7 @@ register_socket(CURL *curl, curl_socket_t socket, int what, void *ctx, break; default: - actx_error(actx, "unknown libcurl socket operation: %d", what); + actx_error_internal(actx, "unknown libcurl socket operation: %d", what); return -1; } @@ -1276,15 +1282,15 @@ register_socket(CURL *curl, curl_socket_t socket, int what, void *ctx, switch (op) { case EPOLL_CTL_ADD: - actx_error(actx, "could not add to epoll set: %m"); + actx_error_internal(actx, "could not add to epoll set: %m"); break; case EPOLL_CTL_DEL: - actx_error(actx, "could not delete from epoll set: %m"); + actx_error_internal(actx, "could not delete from epoll set: %m"); break; default: - actx_error(actx, "could not update epoll set: %m"); + 
actx_error_internal(actx, "could not update epoll set: %m"); } return -1; @@ -1334,7 +1340,7 @@ register_socket(CURL *curl, curl_socket_t socket, int what, void *ctx, break; default: - actx_error(actx, "unknown libcurl socket operation: %d", what); + actx_error_internal(actx, "unknown libcurl socket operation: %d", what); return -1; } @@ -1344,7 +1350,7 @@ register_socket(CURL *curl, curl_socket_t socket, int what, void *ctx, res = kevent(actx->mux, ev, nev, ev_out, nev, &timeout); if (res < 0) { - actx_error(actx, "could not modify kqueue: %m"); + actx_error_internal(actx, "could not modify kqueue: %m"); return -1; } @@ -1368,10 +1374,10 @@ register_socket(CURL *curl, curl_socket_t socket, int what, void *ctx, switch (what) { case CURL_POLL_REMOVE: - actx_error(actx, "could not delete from kqueue: %m"); + actx_error_internal(actx, "could not delete from kqueue: %m"); break; default: - actx_error(actx, "could not add to kqueue: %m"); + actx_error_internal(actx, "could not add to kqueue: %m"); } return -1; } @@ -1420,7 +1426,7 @@ comb_multiplexer(struct async_ctx *actx) */ if (kevent(actx->mux, NULL, 0, &ev, 1, &timeout) < 0) { - actx_error(actx, "could not comb kqueue: %m"); + actx_error_internal(actx, "could not comb kqueue: %m"); return false; } @@ -1470,7 +1476,7 @@ set_timer(struct async_ctx *actx, long timeout) if (timerfd_settime(actx->timerfd, 0 /* no flags */ , &spec, NULL) < 0) { - actx_error(actx, "setting timerfd to %ld: %m", timeout); + actx_error_internal(actx, "setting timerfd to %ld: %m", timeout); return false; } @@ -1500,14 +1506,14 @@ set_timer(struct async_ctx *actx, long timeout) EV_SET(&ev, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0); if (kevent(actx->timerfd, &ev, 1, NULL, 0, NULL) < 0 && errno != ENOENT) { - actx_error(actx, "deleting kqueue timer: %m"); + actx_error_internal(actx, "deleting kqueue timer: %m"); return false; } EV_SET(&ev, actx->timerfd, EVFILT_READ, EV_DELETE, 0, 0, 0); if (kevent(actx->mux, &ev, 1, NULL, 0, NULL) < 0 && errno != ENOENT) { - actx_error(actx, "removing kqueue timer from multiplexer: %m"); + actx_error_internal(actx, "removing kqueue timer from multiplexer: %m"); return false; } @@ -1518,14 +1524,14 @@ set_timer(struct async_ctx *actx, long timeout) EV_SET(&ev, 1, EVFILT_TIMER, (EV_ADD | EV_ONESHOT), 0, timeout, 0); if (kevent(actx->timerfd, &ev, 1, NULL, 0, NULL) < 0) { - actx_error(actx, "setting kqueue timer to %ld: %m", timeout); + actx_error_internal(actx, "setting kqueue timer to %ld: %m", timeout); return false; } EV_SET(&ev, actx->timerfd, EVFILT_READ, EV_ADD, 0, 0, 0); if (kevent(actx->mux, &ev, 1, NULL, 0, NULL) < 0) { - actx_error(actx, "adding kqueue timer to multiplexer: %m"); + actx_error_internal(actx, "adding kqueue timer to multiplexer: %m"); return false; } @@ -2153,7 +2159,7 @@ finish_discovery(struct async_ctx *actx) /* * Pull the fields we care about from the document. 
*/ - actx->errctx = "failed to parse OpenID discovery document"; + actx->errctx = libpq_gettext("failed to parse OpenID discovery document"); if (!parse_provider(actx, &actx->provider)) return false; /* error message already set */ @@ -2421,7 +2427,7 @@ finish_device_authz(struct async_ctx *actx) */ if (response_code == 200) { - actx->errctx = "failed to parse device authorization"; + actx->errctx = libpq_gettext("failed to parse device authorization"); if (!parse_device_authz(actx, &actx->authz)) return false; /* error message already set */ @@ -2509,7 +2515,7 @@ finish_token_request(struct async_ctx *actx, struct token *tok) */ if (response_code == 200) { - actx->errctx = "failed to parse access token response"; + actx->errctx = libpq_gettext("failed to parse access token response"); if (!parse_access_token(actx, tok)) return false; /* error message already set */ @@ -2888,7 +2894,7 @@ pg_fe_run_oauth_flow_impl(PGconn *conn) switch (actx->step) { case OAUTH_STEP_INIT: - actx->errctx = "failed to fetch OpenID discovery document"; + actx->errctx = libpq_gettext("failed to fetch OpenID discovery document"); if (!start_discovery(actx, conn_oauth_discovery_uri(conn))) goto error_return; @@ -2902,11 +2908,11 @@ pg_fe_run_oauth_flow_impl(PGconn *conn) if (!check_issuer(actx, conn)) goto error_return; - actx->errctx = "cannot run OAuth device authorization"; + actx->errctx = libpq_gettext("cannot run OAuth device authorization"); if (!check_for_device_flow(actx)) goto error_return; - actx->errctx = "failed to obtain device authorization"; + actx->errctx = libpq_gettext("failed to obtain device authorization"); if (!start_device_authz(actx, conn)) goto error_return; @@ -2917,7 +2923,7 @@ pg_fe_run_oauth_flow_impl(PGconn *conn) if (!finish_device_authz(actx)) goto error_return; - actx->errctx = "failed to obtain access token"; + actx->errctx = libpq_gettext("failed to obtain access token"); if (!start_token_request(actx, conn)) goto error_return; @@ -2968,7 +2974,7 @@ pg_fe_run_oauth_flow_impl(PGconn *conn) break; case OAUTH_STEP_WAIT_INTERVAL: - actx->errctx = "failed to obtain access token"; + actx->errctx = libpq_gettext("failed to obtain access token"); if (!start_token_request(actx, conn)) goto error_return; @@ -2994,7 +3000,7 @@ error_return: * also the documentation for struct async_ctx. */ if (actx->errctx) - appendPQExpBuffer(errbuf, "%s: ", libpq_gettext(actx->errctx)); + appendPQExpBuffer(errbuf, "%s: ", actx->errctx); if (PQExpBufferDataBroken(actx->errbuf)) appendPQExpBufferStr(errbuf, libpq_gettext("out of memory")); diff --git a/src/interfaces/libpq/fe-auth-oauth.c b/src/interfaces/libpq/fe-auth-oauth.c index d146c5f567c..fb63a3249d6 100644 --- a/src/interfaces/libpq/fe-auth-oauth.c +++ b/src/interfaces/libpq/fe-auth-oauth.c @@ -183,7 +183,14 @@ struct json_ctx #define oauth_json_has_error(ctx) \ (PQExpBufferDataBroken((ctx)->errbuf) || (ctx)->errmsg) -#define oauth_json_set_error(ctx, ...) \ +#define oauth_json_set_error(ctx, fmt, ...) \ + do { \ + appendPQExpBuffer(&(ctx)->errbuf, libpq_gettext(fmt), ##__VA_ARGS__); \ + (ctx)->errmsg = (ctx)->errbuf.data; \ + } while (0) + +/* An untranslated version of oauth_json_set_error(). */ +#define oauth_json_set_error_internal(ctx, ...) 
\ do { \ appendPQExpBuffer(&(ctx)->errbuf, __VA_ARGS__); \ (ctx)->errmsg = (ctx)->errbuf.data; \ @@ -199,13 +206,13 @@ oauth_json_object_start(void *state) Assert(ctx->nested == 1); oauth_json_set_error(ctx, - libpq_gettext("field \"%s\" must be a string"), + "field \"%s\" must be a string", ctx->target_field_name); } ++ctx->nested; if (ctx->nested > MAX_SASL_NESTING_LEVEL) - oauth_json_set_error(ctx, libpq_gettext("JSON is too deeply nested")); + oauth_json_set_error(ctx, "JSON is too deeply nested"); return oauth_json_has_error(ctx) ? JSON_SEM_ACTION_FAILED : JSON_SUCCESS; } @@ -254,20 +261,20 @@ oauth_json_array_start(void *state) if (!ctx->nested) { - ctx->errmsg = libpq_gettext("top-level element must be an object"); + oauth_json_set_error(ctx, "top-level element must be an object"); } else if (ctx->target_field) { Assert(ctx->nested == 1); oauth_json_set_error(ctx, - libpq_gettext("field \"%s\" must be a string"), + "field \"%s\" must be a string", ctx->target_field_name); } ++ctx->nested; if (ctx->nested > MAX_SASL_NESTING_LEVEL) - oauth_json_set_error(ctx, libpq_gettext("JSON is too deeply nested")); + oauth_json_set_error(ctx, "JSON is too deeply nested"); return oauth_json_has_error(ctx) ? JSON_SEM_ACTION_FAILED : JSON_SUCCESS; } @@ -288,7 +295,7 @@ oauth_json_scalar(void *state, char *token, JsonTokenType type) if (!ctx->nested) { - ctx->errmsg = libpq_gettext("top-level element must be an object"); + oauth_json_set_error(ctx, "top-level element must be an object"); return JSON_SEM_ACTION_FAILED; } @@ -301,9 +308,9 @@ oauth_json_scalar(void *state, char *token, JsonTokenType type) * Assert and don't continue any further for production builds. */ Assert(false); - oauth_json_set_error(ctx, - "internal error: target scalar found at nesting level %d during OAUTHBEARER parsing", - ctx->nested); + oauth_json_set_error_internal(ctx, + "internal error: target scalar found at nesting level %d during OAUTHBEARER parsing", + ctx->nested); return JSON_SEM_ACTION_FAILED; } @@ -314,7 +321,7 @@ oauth_json_scalar(void *state, char *token, JsonTokenType type) if (*ctx->target_field) { oauth_json_set_error(ctx, - libpq_gettext("field \"%s\" is duplicated"), + "field \"%s\" is duplicated", ctx->target_field_name); return JSON_SEM_ACTION_FAILED; } @@ -323,7 +330,7 @@ oauth_json_scalar(void *state, char *token, JsonTokenType type) if (type != JSON_TOKEN_STRING) { oauth_json_set_error(ctx, - libpq_gettext("field \"%s\" must be a string"), + "field \"%s\" must be a string", ctx->target_field_name); return JSON_SEM_ACTION_FAILED; } diff --git a/src/interfaces/libpq/nls.mk b/src/interfaces/libpq/nls.mk index b87df277d93..3fa87a0aaac 100644 --- a/src/interfaces/libpq/nls.mk +++ b/src/interfaces/libpq/nls.mk @@ -1,6 +1,7 @@ # src/interfaces/libpq/nls.mk CATALOG_NAME = libpq GETTEXT_FILES = fe-auth.c \ + fe-auth-oauth.c \ fe-auth-scram.c \ fe-cancel.c \ fe-connect.c \ @@ -21,6 +22,7 @@ GETTEXT_TRIGGERS = actx_error:2 \ libpq_append_error:2 \ libpq_gettext \ libpq_ngettext:1,2 \ + oauth_json_set_error:2 \ oauth_parse_set_error:2 \ pqInternalNotice:2 GETTEXT_FLAGS = actx_error:2:c-format \ @@ -29,5 +31,6 @@ GETTEXT_FLAGS = actx_error:2:c-format \ libpq_gettext:1:pass-c-format \ libpq_ngettext:1:pass-c-format \ libpq_ngettext:2:pass-c-format \ + oauth_json_set_error:2:c-format \ oauth_parse_set_error:2:c-format \ pqInternalNotice:2:c-format diff --git a/src/test/modules/injection_points/injection_points.c b/src/test/modules/injection_points/injection_points.c index 417b61f31c5..25340e8d81b 100644 --- 
a/src/test/modules/injection_points/injection_points.c +++ b/src/test/modules/injection_points/injection_points.c @@ -115,7 +115,7 @@ static shmem_startup_hook_type prev_shmem_startup_hook = NULL; * when initializing dynamically with a DSM or when loading the module. */ static void -injection_point_init_state(void *ptr) +injection_point_init_state(void *ptr, void *arg) { InjectionPointSharedState *state = (InjectionPointSharedState *) ptr; @@ -159,7 +159,7 @@ injection_shmem_startup(void) * First time through, so initialize. This is shared with the dynamic * initialization using a DSM. */ - injection_point_init_state(inj_state); + injection_point_init_state(inj_state, NULL); } LWLockRelease(AddinShmemInitLock); @@ -179,7 +179,7 @@ injection_init_shmem(void) inj_state = GetNamedDSMSegment("injection_points", sizeof(InjectionPointSharedState), injection_point_init_state, - &found); + &found, NULL); } /* diff --git a/src/test/modules/test_dsa/test_dsa.c b/src/test/modules/test_dsa/test_dsa.c index 21e4ce6a745..b9c45c0d41c 100644 --- a/src/test/modules/test_dsa/test_dsa.c +++ b/src/test/modules/test_dsa/test_dsa.c @@ -21,7 +21,7 @@ PG_MODULE_MAGIC; static void -init_tranche(void *ptr) +init_tranche(void *ptr, void *arg) { int *tranche_id = (int *) ptr; @@ -39,7 +39,7 @@ test_dsa_basic(PG_FUNCTION_ARGS) dsa_pointer p[100]; tranche_id = GetNamedDSMSegment("test_dsa", sizeof(int), - init_tranche, &found); + init_tranche, &found, NULL); a = dsa_create(*tranche_id); for (int i = 0; i < 100; i++) @@ -80,7 +80,7 @@ test_dsa_resowners(PG_FUNCTION_ARGS) ResourceOwner childowner; tranche_id = GetNamedDSMSegment("test_dsa", sizeof(int), - init_tranche, &found); + init_tranche, &found, NULL); /* Create DSA in parent resource owner */ a = dsa_create(*tranche_id); diff --git a/src/test/modules/test_dsm_registry/test_dsm_registry.c b/src/test/modules/test_dsm_registry/test_dsm_registry.c index 4cc2ccdac3f..3e5f4f3cda1 100644 --- a/src/test/modules/test_dsm_registry/test_dsm_registry.c +++ b/src/test/modules/test_dsm_registry/test_dsm_registry.c @@ -44,10 +44,13 @@ static const dshash_parameters dsh_params = { }; static void -init_tdr_dsm(void *ptr) +init_tdr_dsm(void *ptr, void *arg) { TestDSMRegistryStruct *dsm = (TestDSMRegistryStruct *) ptr; + if ((int) (intptr_t) arg != 5432) + elog(ERROR, "unexpected arg value %d", (int) (intptr_t) arg); + LWLockInitialize(&dsm->lck, LWLockNewTrancheId("test_dsm_registry")); dsm->val = 0; } @@ -60,7 +63,7 @@ tdr_attach_shmem(void) tdr_dsm = GetNamedDSMSegment("test_dsm_registry_dsm", sizeof(TestDSMRegistryStruct), init_tdr_dsm, - &found); + &found, (void *) (intptr_t) 5432); if (tdr_dsa == NULL) tdr_dsa = GetNamedDSA("test_dsm_registry_dsa", &found); diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index 523a5cd5b52..e93248bd66e 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -58,6 +58,7 @@ tests += { 't/047_checkpoint_physical_slot.pl', 't/048_vacuum_horizon_floor.pl', 't/049_wait_for_lsn.pl', + 't/050_redo_segment_missing.pl', ], }, } diff --git a/src/test/recovery/t/050_redo_segment_missing.pl b/src/test/recovery/t/050_redo_segment_missing.pl new file mode 100644 index 00000000000..f5eb6c30fe6 --- /dev/null +++ b/src/test/recovery/t/050_redo_segment_missing.pl @@ -0,0 +1,117 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group +# +# Evaluates PostgreSQL's recovery behavior when a WAL segment containing the +# redo record is missing, with a checkpoint record located in a different +# segment. 
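The test-module hunks above follow the GetNamedDSMSegment() signature change that threads an extra void *arg through to the init callback, which now has the shape (void *ptr, void *arg). A hedged sketch of how a third-party module might use that argument, assuming the five-argument GetNamedDSMSegment() and the string-taking LWLockNewTrancheId() shown in this diff; MyState, my_module, and the magic value are illustrative only:

#include "postgres.h"

#include "fmgr.h"
#include "storage/dsm_registry.h"
#include "storage/lwlock.h"

PG_MODULE_MAGIC;

typedef struct MyState
{
    LWLock      lck;
    int         counter;
} MyState;

/* Matches the callback shape used above: the registry passes "arg" through. */
static void
my_init_state(void *ptr, void *arg)
{
    MyState    *state = (MyState *) ptr;
    int         magic = (int) (intptr_t) arg;

    if (magic != 42)
        elog(ERROR, "unexpected init argument: %d", magic);

    LWLockInitialize(&state->lck, LWLockNewTrancheId("my_module"));
    state->counter = 0;
}

/* Attach to the named segment, initializing it on first use. */
static MyState *
my_attach(void)
{
    bool        found;

    return (MyState *) GetNamedDSMSegment("my_module_state",
                                          sizeof(MyState),
                                          my_init_state,
                                          &found,
                                          (void *) (intptr_t) 42);
}

The extra argument spares callers from smuggling per-call data through globals, which appears to be what the updated test_dsm_registry check for the 5432 marker exercises.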
+ +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node = PostgreSQL::Test::Cluster->new('testnode'); +$node->init; +$node->append_conf('postgresql.conf', 'log_checkpoints = on'); +$node->start; + +# Check if the extension injection_points is available, as it may be +# possible that this script is run with installcheck, where the module +# would not be installed by default. +if (!$node->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} +$node->safe_psql('postgres', q(CREATE EXTENSION injection_points)); + +# Note that this uses two injection points based on waits, not one. This +# may look strange, but it works as a workaround to enforce all memory +# allocations to happen outside the critical section of the checkpoint +# required for this test. +# First, "create-checkpoint-initial" is run outside the critical +# section, and is used as a way to initialize the shared memory required +# for the wait machinery with its DSM registry. +# Then, "create-checkpoint-run" is loaded outside the critical section of +# a checkpoint to allocate any memory required by the library load, and +# its callback is run inside the critical section. +$node->safe_psql('postgres', + q{select injection_points_attach('create-checkpoint-initial', 'wait')}); +$node->safe_psql('postgres', + q{select injection_points_attach('create-checkpoint-run', 'wait')}); + +# Start a psql session to run the checkpoint in the background and make +# the test wait on the injection point so the checkpoint stops just after +# it starts. +my $checkpoint = $node->background_psql('postgres'); +$checkpoint->query_until( + qr/starting_checkpoint/, + q(\echo starting_checkpoint +checkpoint; +)); + +# Wait for the initial point to finish; the checkpointer is still +# outside its critical section. Then release to reach the second +# point. +$node->wait_for_event('checkpointer', 'create-checkpoint-initial'); +$node->safe_psql('postgres', + q{select injection_points_wakeup('create-checkpoint-initial')}); + +# Wait until the checkpoint has reached the second injection point. +# We are now in the middle of a running checkpoint, after the redo +# record has been logged. +$node->wait_for_event('checkpointer', 'create-checkpoint-run'); + +# Switch the WAL segment, ensuring that the redo record will be included +# in a different segment than the checkpoint record. +$node->safe_psql('postgres', 'SELECT pg_switch_wal()'); + +# Continue the checkpoint and wait for its completion. +my $log_offset = -s $node->logfile; +$node->safe_psql('postgres', + q{select injection_points_wakeup('create-checkpoint-run')}); +$node->wait_for_log(qr/checkpoint complete/, $log_offset); + +$checkpoint->quit; + +# Retrieve the WAL file names for the redo record and checkpoint record. +my $redo_lsn = $node->safe_psql('postgres', + "SELECT redo_lsn FROM pg_control_checkpoint()"); +my $redo_walfile_name = + $node->safe_psql('postgres', "SELECT pg_walfile_name('$redo_lsn')"); +my $checkpoint_lsn = $node->safe_psql('postgres', + "SELECT checkpoint_lsn FROM pg_control_checkpoint()"); +my $checkpoint_walfile_name = + $node->safe_psql('postgres', "SELECT pg_walfile_name('$checkpoint_lsn')"); + +# Redo record and checkpoint record should be on different segments.
+isnt($redo_walfile_name, $checkpoint_walfile_name, + 'redo and checkpoint records on different segments'); + +# Remove the WAL segment containing the redo record. +unlink $node->data_dir . "/pg_wal/$redo_walfile_name" + or die "could not remove WAL file: $!"; + +$node->stop('immediate'); + +# Use run_log instead of node->start because this test expects that +# the server ends with an error during recovery. +run_log( + [ + 'pg_ctl', + '--pgdata' => $node->data_dir, + '--log' => $node->logfile, + 'start', + ]); + +# Confirm that recovery has failed, as expected. +my $logfile = slurp_file($node->logfile()); +ok( $logfile =~ + qr/FATAL: .* could not find redo location .* referenced by checkpoint record at .*/, + "ends with FATAL because it could not find redo location"); + +done_testing(); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3451538565e..04845d5e680 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -366,6 +366,7 @@ BulkWriteBuffer BulkWriteState BumpBlock BumpContext +ByteaSortSupport CACHESIGN CAC_state CCFastEqualFN
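The new recovery test depends on pg_switch_wal() pushing the checkpoint record into a different WAL file from the redo record it points back to, which is what the pg_walfile_name() comparison asserts. As a rough, standalone illustration (not PostgreSQL source) of how an LSN maps to a segment file name, here is a sketch assuming the default 16MB wal_segment_size, timeline 1, and two made-up sample LSNs:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    const uint64_t seg_size = 16 * 1024 * 1024; /* default wal_segment_size */
    const uint32_t timeline = 1;
    const uint64_t lsns[] = {
        UINT64_C(0x0000000001000028),   /* hypothetical redo LSN */
        UINT64_C(0x0000000002000060),   /* hypothetical checkpoint LSN */
    };

    for (int i = 0; i < 2; i++)
    {
        uint64_t    segno = lsns[i] / seg_size;
        uint64_t    segs_per_xlogid = UINT64_C(0x100000000) / seg_size;

        /* WAL file name: timeline, then high and low halves of the segment number */
        printf("%08" PRIX32 "%08" PRIX32 "%08" PRIX32 "\n",
               timeline,
               (uint32_t) (segno / segs_per_xlogid),
               (uint32_t) (segno % segs_per_xlogid));
    }
    return 0;
}

Because the file name only changes at segment boundaries, two records written close together normally share a file; forcing a segment switch between them is what lets the test delete the redo segment while keeping the checkpoint record intact.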
