1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
|
/*
* slru_io.c
*
* Routines for reading and writing SLRU files during upgrade.
*
* Copyright (c) 2025, PostgreSQL Global Development Group
* src/bin/pg_upgrade/slru_io.c
*/
#include "postgres_fe.h"
#include <fcntl.h>
#include "common/fe_memutils.h"
#include "common/file_perm.h"
#include "common/file_utils.h"
#include "pg_upgrade.h"
#include "port/pg_iovec.h"
#include "slru_io.h"
/* Forward declarations of the file-local helpers defined below. */
static SlruSegState *AllocSlruSegState(const char *dir);
static char *SlruFileName(SlruSegState *state, int64 segno);
static void SlruFlush(SlruSegState *state);
/* common parts of AllocSlruRead and AllocSlruWrite */
static SlruSegState *
AllocSlruSegState(const char *dir)
{
SlruSegState *state = pg_malloc(sizeof(*state));
state->dir = pstrdup(dir);
state->fn = NULL;
state->fd = -1;
state->segno = -1;
state->pageno = 0;
/* state->writing and state->long_segment_names must be set by caller! */
return state;
}
/*
 * Build the path of segment file 'segno' inside state->dir.
 *
 * With long segment names the segment number is printed as 15 upper-case hex
 * digits, otherwise zero-padded to 4 hex digits.  The string is
 * allocated by psprintf().
 *
 * Similar to the backend function with the same name.
 */
static char *
SlruFileName(SlruSegState *state, int64 segno)
{
	if (!state->long_segment_names)
	{
		/* short naming scheme */
		Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
		return psprintf("%s/%04X", state->dir, (unsigned int) segno);
	}

	/* long naming scheme */
	Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
	return psprintf("%s/%015" PRIX64, state->dir, segno);
}
/*
 * Create an SLRU reader for the segment files in 'dir'.
 *
 * 'long_segment_names' selects which of the two file naming schemes handled
 * by SlruFileName() is used.
 */
SlruSegState *
AllocSlruRead(const char *dir, bool long_segment_names)
{
	SlruSegState *reader = AllocSlruSegState(dir);

	reader->long_segment_names = long_segment_names;
	reader->writing = false;

	return reader;
}
/*
 * Read the given page into memory buffer.
 *
 * Reading can be done in random order.
 *
 * If the file containing 'pageno' does not exist, a fatal error is raised.
 * If the file exists but is shorter than expected, the missing part is read
 * as zeros and a warning is logged.  That is reasonable behavior for current
 * callers.
 *
 * Returns a pointer to the state's page buffer (state->buf.data), which now
 * holds the contents of 'pageno'.
 *
 * This is the slow path of the inlineable SlruReadSwitchPage() function.
 */
char *
SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno)
{
	int64		segno;
	off_t		offset;
	ssize_t		bytes_read;

	Assert(!state->writing);	/* read only mode */

	/* Nothing to do if the requested page is already in the buffer */
	if (state->segno != -1 && pageno == state->pageno)
		return state->buf.data;

	/* If the new page is on a different SLRU segment, open the new segment */
	segno = pageno / SLRU_PAGES_PER_SEGMENT;
	if (segno != state->segno)
	{
		/* Release the previously opened segment, if any */
		if (state->segno != -1)
		{
			close(state->fd);
			state->fd = -1;

			pg_free(state->fn);
			state->fn = NULL;

			state->segno = -1;
		}

		state->fn = SlruFileName(state, segno);
		if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0)
			pg_fatal("could not open file \"%s\": %m", state->fn);
		state->segno = segno;
	}

	/* Read the page, looping to cope with short reads and EINTR */
	offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
	bytes_read = 0;
	while (bytes_read < BLCKSZ)
	{
		ssize_t		rc;

		/*
		 * The destination must be computed from state->buf.data (a pointer to
		 * the first byte) so that the addition advances by bytes.  Taking
		 * &state->buf.data instead yields a pointer to the whole array, and
		 * adding to that is scaled by the array size, pointing far outside
		 * the buffer after any short read.
		 */
		rc = pg_pread(state->fd,
					  state->buf.data + bytes_read,
					  BLCKSZ - bytes_read,
					  offset);
		if (rc < 0)
		{
			if (errno == EINTR)
				continue;
			pg_fatal("could not read file \"%s\": %m", state->fn);
		}
		if (rc == 0)
		{
			/* unexpected EOF: treat the rest of the page as zeros */
			pg_log(PG_WARNING, "unexpected EOF reading file \"%s\" at offset %u, reading as zeros",
				   state->fn, (unsigned int) offset);
			memset(state->buf.data + bytes_read, 0, BLCKSZ - bytes_read);
			break;
		}
		bytes_read += rc;
		offset += rc;
	}
	state->pageno = pageno;

	return state->buf.data;
}
/*
 * Free the reader.
 *
 * Closes the currently open segment file, if any, and releases the state
 * together with the file name and directory strings it holds.  'state' must
 * not be used after this call.
 */
void
FreeSlruRead(SlruSegState *state)
{
	Assert(!state->writing);	/* read only mode */

	if (state->fd != -1)
		close(state->fd);

	/* fn stays NULL if no segment was ever opened */
	if (state->fn != NULL)
		pg_free(state->fn);
	/* dir was pstrdup'd when the state was allocated */
	pg_free(state->dir);
	pg_free(state);
}
/*
 * Create an SLRU writer for the segment files in 'dir'.
 *
 * 'long_segment_names' selects which of the two file naming schemes handled
 * by SlruFileName() is used.
 */
SlruSegState *
AllocSlruWrite(const char *dir, bool long_segment_names)
{
	SlruSegState *writer = AllocSlruSegState(dir);

	writer->long_segment_names = long_segment_names;
	writer->writing = true;

	return writer;
}
/*
 * Make 'pageno' the writer's current page and return its buffer.
 *
 * Any previously buffered page is flushed to its segment file first, and the
 * buffer is handed back zero-filled.
 *
 * NOTE: A new segment file is created with O_EXCL when stepping to a new
 * segment, so this assumes that each segment is written in full before
 * moving on to the next one.  This limitation would be easy to lift if
 * needed, but it fits the usage pattern of current callers.
 *
 * This is the slow path of the inlineable SlruWriteSwitchPage() function.
 */
char *
SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno)
{
	int64		target_segno = pageno / SLRU_PAGES_PER_SEGMENT;
	off_t		page_offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;

	Assert(state->writing);

	/* Already positioned on the requested page? */
	if (state->segno != -1 && state->pageno == pageno)
		return state->buf.data;

	/* Write out the old page, then start the new one from all zeros */
	SlruFlush(state);
	memset(state->buf.data, 0, BLCKSZ);

	if (state->segno != target_segno)
	{
		/* Let go of the segment we were writing to, if any */
		if (state->segno != -1)
		{
			close(state->fd);
			state->fd = -1;

			pg_free(state->fn);
			state->fn = NULL;

			state->segno = -1;
		}

		/* Create the segment; O_EXCL makes this fail if it already exists */
		state->fn = SlruFileName(state, target_segno);
		state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
						 pg_file_create_mode);
		if (state->fd < 0)
			pg_fatal("could not create file \"%s\": %m", state->fn);
		state->segno = target_segno;

		/*
		 * If the first page written to the new file is not at its start,
		 * zero-fill the part of the file that precedes it.
		 */
		if (page_offset > 0 &&
			pg_pwrite_zeros(state->fd, page_offset, 0) < 0)
			pg_fatal("could not write file \"%s\": %m", state->fn);
	}

	state->pageno = pageno;

	return state->buf.data;
}
/*
 * Write the buffered page to its position in the currently open segment
 * file.  Does nothing if no segment has been opened yet.  A write failure is
 * reported with pg_fatal().
 */
static void
SlruFlush(SlruSegState *state)
{
	struct iovec iov;
	off_t		page_offset;

	/* no current segment, hence nothing to write */
	if (state->segno == -1)
		return;

	iov.iov_base = &state->buf;
	iov.iov_len = BLCKSZ;

	page_offset = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
	if (pg_pwritev_with_retry(state->fd, &iov, 1, page_offset) < 0)
		pg_fatal("could not write file \"%s\": %m", state->fn);
}
/*
 * Free the writer.
 *
 * Flushes the page still held in the buffer, closes the currently open
 * segment file, if any, and releases the state together with the file name
 * and directory strings it holds.  'state' must not be used after this call.
 */
void
FreeSlruWrite(SlruSegState *state)
{
	Assert(state->writing);

	/* write out the last buffered page before closing the file */
	SlruFlush(state);

	if (state->fd != -1)
		close(state->fd);

	/* fn stays NULL if no segment was ever created */
	if (state->fn != NULL)
		pg_free(state->fn);
	/* dir was pstrdup'd when the state was allocated */
	pg_free(state->dir);
	pg_free(state);
}
|