PgSQL · 特性分析 · PostgreSQL Aurora方案与DEMO

前言

亚马逊推出的Aurora数据库引擎,支持一份存储,一主多读的架构。这个架构和Oracle RAC类似,也是共享存储,但是只有一个实例可以执行写操作,其他实例只能执行读操作。相比传统的基于复制的一主多读,节约了存储和网络带宽的成本。

我们可以使用Postgresql的hot standby模式来模拟这种共享存储一主多读的架构,但是需要注意几点,hot standby也会对数据库有写的动作,例如recovery时,会修改控制文件,数据文件等等,这些操作是多余的。另外很多状态是存储在内存中的,所以内存状态也需要更新。

还有需要注意的是:

pg_xlog
pg_log
pg_clog
pg_multixact
postgresql.conf
recovery.conf
postmaster.pid

最终实现一主多备的架构,需要通过改PG内核来实现:

  1. 这些文件应该是每个实例对应一份。
    postgresql.conf,recovery.conf,postmaster.pid,pg_control
  2. hot standby不执行实际的恢复操作,但是需要更新自己的内存状态,如当前的OID,XID等等,以及更新自己的pg_control。
  3. 在多实例间,要实现主到备节点的OS脏页的同步,数据库shared buffer脏页的同步。

模拟过程

不改任何代码,在同一主机下启多实例测试,会遇到一些问题。(后面有问题描述,以及如何修改代码来修复这些问题)

主实例配置文件

# vi postgresql.conf
listen_addresses='0.0.0.0'
port=1921
max_connections=100
unix_socket_directories='.'
ssl=on
ssl_ciphers='EXPORT40'
shared_buffers=512MB
huge_pages=try
max_prepared_transactions=0
max_stack_depth=100kB
dynamic_shared_memory_type=posix
max_files_per_process=500
wal_level=logical
fsync=off
synchronous_commit=off
wal_sync_method=open_datasync
full_page_writes=off
wal_log_hints=off
wal_buffers=16MB
wal_writer_delay=10ms
checkpoint_segments=8
archive_mode=off
archive_command='/bin/date'
max_wal_senders=10
max_replication_slots=10
hot_standby=on
wal_receiver_status_interval=1s
hot_standby_Feedback=on
enable_bitmapscan=on
enable_hashagg=on
enable_hashjoin=on
enable_indexscan=on
enable_material=on
enable_mergejoin=on
enable_nestloop=on
enable_seqscan=on
enable_sort=on
enable_tidscan=on
log_destination='csvlog'
logging_collector=on
log_directory='pg_log'
log_truncate_on_rotation=on
log_rotation_size=10MB
log_checkpoints=on
log_connections=on
log_disconnections=on
log_duration=off
log_error_verbosity=verbose
log_line_prefix='%i
log_statement='none'
log_timezone='PRC'
autovacuum=on
log_autovacuum_min_duration=0
autovacuum_vacuum_scale_factor=0.0002
autovacuum_analyze_scale_factor=0.0001
datestyle='iso,timezone='PRC'
lc_messages='C'
lc_monetary='C'
lc_numeric='C'
lc_time='C'
default_text_search_config='pg_catalog.english'

 # vi recovery.done
recovery_target_timeline='latest'
standby_mode=on
primary_conninfo = 'host=127.0.0.1 port=1921 user=postgres keepalives_idle=60'

 # vi pg_hba.conf
local   replication     postgres                                trust
host    replication     postgres 127.0.0.1/32            trust

启动主实例。

postgres@digoal-> pg_ctl start

启动只读实例,必须先删除postmaster.pid,这点Postgresql新版本加了一个PATCH,如果这个文件删除,会自动关闭数据库,所以我们需要注意,不要使用最新的PGsql,或者把这个patch干掉先。

postgres@digoal-> cd $PGDATA
postgres@digoal-> mv recovery.done recovery.conf

postgres@digoal-> rm -f postmaster.pid
postgres@digoal-> pg_ctl start -o "-c log_directory=pg_log1922 -c port=1922"

查看当前控制文件状态,只读实例改了控制文件,和前面描述一致。

postgres@digoal-> pg_controldata |grep state
Database cluster state:               in archive recovery

连到主实例,创建表,插入测试数据。

psql -p 1921
postgres=# create table test1(id int);
CREATE TABLE postgres=# insert into test1 select generate_series(1,10);
INSERT 0 10 

在只读实例查看插入的数据。

postgres@digoal-> psql -h 127.0.0.1 -p 1922
postgres=# select * from test1;
 id
----
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
(10 rows)

主实例执行检查点后,控制文件状态会改回生产状态。

psql -p 1921
postgres=# checkpoint;
CHECKPOINT

postgres@digoal-> pg_controldata |grep state
Database cluster state:               in production

但是如果在只读实例执行完检查点,又会改回恢复状态。

postgres@digoal-> psql -h 127.0.0.1 -p 1922
psql (9.4.4)
postgres=# checkpoint;
CHECKPOINT

postgres@digoal-> pg_controldata |grep state
Database cluster state:               in archive recovery

注意到,上面的例子有1个问题,用流复制的话,会从主节点通过网络拷贝XLOG记录,并覆盖同一份已经写过的XLOG记录的对应的OFFSET,这是一个问题,因为可能会造成主节点看到的数据不一致(比如一个数据块改了多次,只读实例在恢复时将它覆盖到老的版本了,在主实例上看到的就会变成老版本的BLOCK,后面再来改这个问题,禁止只读实例恢复数据)。

另一方面,我们知道Postgresql standby会从三个地方(流、pg_xlog、restore_command)读取XLOG进行恢复,所以在共享存储的环境中,我们完全没有必要用流复制的方式,直接从pg_xlog目录读取即可。修改recovery.conf参数,将以下注释

# primary_conninfo = 'host=127.0.0.1 port=1921 user=postgres keepalives_idle=60'

重启只读实例。

pg_ctl stop -m fast postgres@digoal-> pg_ctl start -o "-c log_directory=pg_log1922 -c port=1922" 

重新测试数据一致性。
主实例:

postgres=# insert into test1 select generate_series(1,10);
INSERT 0 10 postgres=# insert into test1 select generate_series(1,10);
INSERT 0 10 

只读实例:

postgres=# select count(*) from test1;
 count
-------
    60
(1 row)

问题分析和解决

截至目前,有几个问题未解决

  1. standby还是要执行recovery的操作,recovery产生的write操作会随着只读实例数量增加增加。另外recovery有一个好处,解决了脏页的问题,主实例shared buffer中的脏页不需要额外的同步给只读实例了。recovery还会带来一个严重的BUG,回放可能和当前主节点操作同一个data page;或者回放时将块回放到老的状态,而实际上主节点又更新了这个块,造成数据块的不一致。如果此时只读实例关闭,然后立即关闭主实例,数据库再起来时,这个数据块是不一致的;
  2. standby还是会改控制文件
  3. 在同一个$PGDATA下启动实例,首先要删除postmaster.pid;
  4. 关闭实例时,已经被删除postmaster.pid的实例,只能通过找到postgres主进程的pid,然后发kill -s 15,2或3的信号来关闭数据库

    static void set_mode(char *modeopt) {
             if (strcmp(modeopt,"s") == 0 || strcmp(modeopt,"smart") == 0)
             {
                     shutdown_mode = SMART_MODE;
                     sig = SIGTERM;
             }
             else if (strcmp(modeopt,"f") == 0 || strcmp(modeopt,"fast") == 0)
             {
                     shutdown_mode = FAST_MODE;
                     sig = SIGINT;
             }
             else if (strcmp(modeopt,"i") == 0 || strcmp(modeopt,"immediate") == 0)
             {
                     shutdown_mode = IMMEDIATE_MODE;
                     sig = SIGQUIT;
             }
             else
             {
                     write_stderr(_("%s: unrecognized shutdown mode \"%s\"\n"),progname,modeopt);
                     do_advice();
                     exit(1);
             }
     }
  5. 当主节点删除rel page时,只读实例回放时,会报invalid xlog对应的rel page不存在的错误,这个也是只读实例需要回放日志带来的问题。非常容易重现这个问题,删除一个表即可。

    2015-10-09 13:30:50.776 CST,2082,561750ab.822,20,2015-10-09 13:29:15 CST,1/0,WARNING,01000,"page 8 of relation base/151898/185251 does not exist","xlog redo clean: rel 1663/151898/185251; blk 8 remxid 640632117","report_invalid_page,xlogutils.c:67",""
     2015-10-09 13:30:50.776 CST,21,PANIC,XX000,"WAL contains references to invalid pages","log_invalid_page,xlogutils.c:91",""

    这个报错可以先注释这一段来绕过,从而可以演示下去。

    src/backend/access/transam/xlogutils.c
     /* Log a reference to an invalid page */
     static void log_invalid_page(RelFileNode node,ForkNumber forkno,BlockNumber blkno,bool present) {
       //////
             /* * Once recovery has reached a consistent state,the invalid-page table * should be empty and remain so. If a reference to an invalid page is * found after consistency is reached,PANIC immediately. This might seem * aggressive,but it's better than letting the invalid reference linger * in the hash table until the end of recovery and PANIC there,which * might come only much later if this is a standby server. */
             //if (reachedConsistency)
             //{
             // report_invalid_page(WARNING,node,forkno,blkno,present);
             // elog(PANIC,"WAL contains references to invalid pages");
             //}
  6. 由于本例是在同一个操作系统中演示,所以没有遇到OS的dirty page cache的问题,如果是不同主机的环境,我们需要解决OS dirty page cache 的同步问题,或者消除dirty page cache,如使用direct IO。或者集群文件系统如gfs2。

如果要产品化,至少需要解决以上问题。

解决Aurora实例写数据文件、控制文件、检查点的问题。

  1. 增加一个启动参数,表示这个实例是否为Aurora实例(即只读实例)

    # vi src/backend/utils/misc/guc.c
     /******** option records follow ********/
    
     static struct config_bool ConfigureNamesBool[] =
     {
             {
                     {"aurora",PGC_POSTMASTER,CONN_AUTH_SETTINGS,gettext_noop("Enables advertising the server via Bonjour."),NULL
                     },&aurora,false,NULL,NULL
             },
  2. 新增变量

    # vi src/include/postmaster/postmaster.h
     extern bool aurora;
  3. 禁止Aurora实例更新控制文件

    # vi src/backend/access/transam/xlog.c
     #include "postmaster/postmaster.h"
     bool aurora;
    
     void UpdateControlFile(void) {
             if (aurora) return;
  4. 禁止Aurora实例启动bgwriter进程

    # vi src/backend/postmaster/bgwriter.c
     #include "postmaster/postmaster.h"
     bool  aurora;
    
     /* * Main entry point for bgwriter process * * This is invoked from AuxiliaryProcessMain,which has already created the * basic execution environment,but not enabled signals yet. */
     void BackgroundWriterMain(void) {
       //////
             pg_usleep(1000000L);
    
             /* * If an exception is encountered,processing resumes here. * * See notes in postgres.c about the design of this coding. */
             if (!aurora && sigsetjmp(local_sigjmp_buf,1) != 0)
             {
    
       //////
                     /* * Do one cycle of dirty-buffer writing. */
                     if (!aurora) {
                     can_hibernate = BgBufferSync();
       //////
                     }
                     pg_usleep(1000000L);
             }
     }
  5. 禁止Aurora实例启动checkpointer进程

    # vi src/backend/postmaster/checkpointer.c
     #include "postmaster/postmaster.h"
     bool  aurora;
       //////
     /* * Main entry point for checkpointer process * * This is invoked from AuxiliaryProcessMain,but not enabled signals yet. */
     void CheckpointerMain(void) {
       //////
             /* * Loop forever */
             for (;;)
             {
                     bool            do_checkpoint = false;
                     int                     flags = 0;
                     pg_time_t       now;
                     int                     elapsed_secs;
                     int                     cur_timeout;
                     int                     rc;
    
                     pg_usleep(100000L);
    
                     /* Clear any already-pending wakeups */
                     if (!aurora)  ResetLatch(&MyProc->procLatch);
    
                     /* * Process any requests or signals received recently. */
                     if (!aurora) AbsorbFsyncRequests();
    
                     if (!aurora && got_SIGHUP)
                     {
                             got_SIGHUP = false;
                             ProcessConfigFile(PGC_SIGHUP);
    
                             /* * Checkpointer is the last process to shut down,so we ask it to * hold the keys for a range of other tasks required most of which * have nothing to do with checkpointing at all. * * For varIoUs reasons,some config values can change dynamically * so the primary copy of them is held in shared memory to make * sure all backends see the same value. We make Checkpointer * responsible for updating the shared memory copy if the * parameter setting changes because of SIGHUP. */
                             UpdateSharedMemoryConfig();
                     }
                     if (!aurora && checkpoint_requested)
                     {
                             checkpoint_requested = false;
                             do_checkpoint = true;
                             BgWriterStats.m_requested_checkpoints++;
                     }
                     if (!aurora && shutdown_requested)
                     {
                             /* * From here on,elog(ERROR) should end with exit(1),not send * control back to the sigsetjmp block above */
                             ExitOnAnyError = true;
                             /* Close down the database */
                             ShutdownXLOG(0,0);
                             /* Normal exit from the checkpointer is here */
                             proc_exit(0);           /* done */
                     }
    
                     /* * Force a checkpoint if too much time has elapsed since the last one. * Note that we count a timed checkpoint in stats only when this * occurs without an external request,but we set the CAUSE_TIME flag * bit even if there is also an external request. */
                     now = (pg_time_t) time(NULL);
                     elapsed_secs = now - last_checkpoint_time;
                     if (!aurora && elapsed_secs >= CheckPointTimeout)
                     {
                             if (!do_checkpoint)
                                     BgWriterStats.m_timed_checkpoints++;
                             do_checkpoint = true;
                             flags |= CHECKPOINT_CAUSE_TIME;
                     }
    
                     /* * Do a checkpoint if requested. */
                     if (!aurora && do_checkpoint)
                     {
                             bool            ckpt_performed = false;
                             bool            do_restartpoint;
    
                             /* use volatile pointer to prevent code rearrangement */
                             volatile CheckpointerShmemStruct *cps = CheckpointerShmem;
    
                             /* * Check if we should perform a checkpoint or a restartpoint. As a * side-effect,RecoveryInProgress() initializes TimeLineID if * it's not set yet. */
                             do_restartpoint = RecoveryInProgress();
    
                             /* * Atomically fetch the request flags to figure out what kind of a * checkpoint we should perform,and increase the started-counter * to acknowledge that we've started a new checkpoint. */
                             SpinLockAcquire(&cps->ckpt_lck);
                             flags |= cps->ckpt_flags;
                             cps->ckpt_flags = 0;
                             cps->ckpt_started++;
                             SpinLockRelease(&cps->ckpt_lck);
    
                             /* * The end-of-recovery checkpoint is a real checkpoint that's * performed while we're still in recovery. */
                             if (flags & CHECKPOINT_END_OF_RECOVERY)
                                     do_restartpoint = false;
       //////
                             ckpt_active = false;
                     }
    
                     /* Check for archive_timeout and switch xlog files if necessary. */
                     if (!aurora) CheckArchiveTimeout();
                     /* * Send off activity statistics to the stats collector. (The reason * why we re-use bgwriter-related code for this is that the bgwriter * and checkpointer used to be just one process. It's probably not * worth the trouble to split the stats support into two independent * stats message types.) */
                     if (!aurora) pgstat_send_bgwriter();
    
                     /* * Sleep until we are signaled or it's time for another checkpoint or * xlog file switch. */
                     now = (pg_time_t) time(NULL);
                     elapsed_secs = now - last_checkpoint_time;
                     if (elapsed_secs >= CheckPointTimeout)
                             continue;                       /* no sleep for us ... */
                     cur_timeout = CheckPointTimeout - elapsed_secs;
                     if (!aurora && XLogArchiveTimeout > 0 && !RecoveryInProgress())
                     {
                             elapsed_secs = now - last_xlog_switch_time;
                             if (elapsed_secs >= XLogArchiveTimeout)
                                     continue;               /* no sleep for us ... */
                             cur_timeout = Min(cur_timeout,XLogArchiveTimeout - elapsed_secs);
                     }
    
                     if (!aurora) rc = WaitLatch(&MyProc->procLatch,WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,cur_timeout * 1000L /* convert to ms */ );
    
                     /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */
                     if (rc & WL_POSTMASTER_DEATH)
                             exit(1);
             }
     }
       //////
     /* SIGINT: set flag to run a normal checkpoint right away */
     static void ReqCheckpointHandler(SIGNAL_ARGS) {
             if (aurora)
                return;
             int                     save_errno = errno;
    
             checkpoint_requested = true;
             if (MyProc)
                     SetLatch(&MyProc->procLatch);
    
             errno = save_errno;
     }
       //////
     /* * AbsorbFsyncRequests * Retrieve queued fsync requests and pass them to local smgr. * * This is exported because it must be called during CreateCheckPoint; * we have to be sure we have accepted all pending requests just before * we start fsync'ing. Since CreateCheckPoint sometimes runs in * non-checkpointer processes,do nothing if not checkpointer. */
     void AbsorbFsyncRequests(void) {
             CheckpointerRequest *requests = NULL;
             CheckpointerRequest *request;
             int                     n;
    
             if (!AmCheckpointerProcess() || aurora)
                     return;
       //////
  6. 禁止Aurora实例手工调用checkpoint命令

    # vi src/backend/tcop/utility.c
     #include "postmaster/postmaster.h"
     bool  aurora;
    
       //////
     void standard_ProcessUtility(Node *parsetree,const char *queryString,ProcessUtilityContext context,ParamListInfo params,DestReceiver *dest,char *completionTag) {
       //////
                     case T_CheckPointStmt:
                        if (!superuser() || aurora)
                                     ereport(ERROR,(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),errmsg("must be superuser to do CHECKPOINT")));

改完上面的代码,重新编译一下,现在接近一个DEMO了。现在Aurora实例不会更新控制文件,不会写数据文件,不会执行checkpoint,是我们想要的结果。
启动只读实例时,加一个参数aurora=true,表示启动Aurora实例。

pg_ctl start -o "-c log_directory=pg_log1922 -c port=1922 -c aurora=true"

不过要产品化,还有很多细节需要考虑,这只是一个DEMO。阿里云RDS的小伙伴们加油!

还有一种更保险的玩法,共享存储多读架构,需要存储两份数据。其中一份是主实例的存储,它自己玩自己的,其他实例不对它做任何操作;另一份是standby的,这部作为共享存储,给多个只读实例来使用。

参考

  1. https://aws.amazon.com/cn/rds/aurora/
  2. src/backend/access/transam/xlog.c

    /*
      * Open the WAL segment containing WAL position 'RecPtr'.
      *
      * The segment can be fetched via restore_command,or via walreceiver having
      * streamed the record,or it can already be present in pg_xlog. Checking
      * pg_xlog is mainly for crash recovery,but it will be polled in standby mode
      * too,in case someone copies a new segment directly to pg_xlog. That is not
      * documented or recommended,though.
      *
      * If 'fetching_ckpt' is true,we're fetching a checkpoint record,and should * prepare to read WAL starting from RedoStartLSN after this. * * 'RecPtr' might not point to the beginning of the record we're interested
      * in,it might also point to the page or segment header. In that case,* 'tliRecPtr' is the position of the WAL record we're interested in. It is * used to decide which timeline to stream the requested WAL from. * * If the record is not immediately available,the function returns false * if we're not in standby mode. In standby mode,waits for it to become
      * available.
      *
      * When the requested record becomes available,the function opens the file
      * containing it (if not open already),and returns true. When end of standby
      * mode is triggered by the user,and there is no more WAL available,returns
      * false.
      */ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,bool randAccess,bool fetching_ckpt,XLogRecPtr tliRecPtr) { ////// static pg_time_t last_fail_time = 0; pg_time_t now; /*-------
              * Standby mode is implemented by a state machine:
              *
              * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE),or just
              *        pg_xlog (XLOG_FROM_XLOG)
              * 2. Check trigger file
              * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
              * 4. Rescan timelines
              * 5. Sleep 5 seconds,and loop back to 1.
              *
              * Failure to read from the current source advances the state machine to
              * the next state.
              *
              * 'currentSource' indicates the current state. There are no currentSource
              * values for "check trigger","rescan timelines",and "sleep" states,* those actions are taken when reading from the prevIoUs source fails,as
              * part of advancing to the next state.
              *-------
              */

相关文章

来源:http://www.postgres.cn/docs/11/ 4.1.1. 标识符和关键词 SQL标识符和关键词必须以一个...
来源:http://www.postgres.cn/docs/11/ 8.1. 数字类型 数字类型由2、4或8字节的整数以及4或8...
来源:http://www.postgres.cn/docs/11/ 5.1. 表基础 SQL并不保证表中行的顺序。当一个表被读...
来源:http://www.postgres.cn/docs/11/ 6.4. 从修改的行中返回数据 有时在修改行的操作过程中...
来源:http://www.postgres.cn/docs/11/ 13.2.1. 读已提交隔离级别 读已提交是PostgreSQL中的...
来源:http://www.postgres.cn/docs/11/ 9.7. 模式匹配 PostgreSQL提供了三种独立的实现模式匹...