更新时间:2024-11-12 GMT+08:00
分享

案例:使用全局二级索引加速查询

全局二级索引仅索引扫描

在对基表非分布列进行查询时,因无法进行DN下推查询或利用索引扫描,产生性能问题。

  • 场景一:普通查询。
     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    gaussdb=# CREATE TABLE t (a INT, b INT, c INT) DISTRIBUTE BY HASH(a);
    CREATE TABLE
    gaussdb=# INSERT INTO t VALUES(generate_series(1,1000), generate_series(1,1000), generate_series(1,1000));
    INSERT 0 1000
    gaussdb=# CREATE INDEX i ON t(b);
    CREATE INDEX
    gaussdb=# SET MAX_DATANODE_FOR_PLAN = 1;
    SET
    gaussdb=# SET ENABLE_FAST_QUERY_SHIPPING = TRUE;
    SET
    gaussdb=# SET ENABLE_LIGHT_PROXY = TRUE;
    SET
    gaussdb=# SET ENABLE_STREAM_OPERATOR = FALSE;
    SET
    gaussdb=# -- 点查,查询条件不是分布列,下推所有DN,在DN上使用普通索引进行查询
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF)  SELECT b FROM t WHERE b = 1;
                         QUERY PLAN                     
    ----------------------------------------------------
     Data Node Scan
       Output: t.b
       Node/s: All datanodes
       Remote query: SELECT b FROM public.t WHERE b = 1
    
     Remote SQL: SELECT b FROM public.t WHERE b = 1
     Datanode Name: datanode1
       [Bypass]
       Index Only Scan using i on public.t
         Output: b
         Index Cond: (t.b = 1)
    
    (12 rows)
    
    gaussdb=# -- 范围查询,在所有DN进行顺序扫描
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF)  SELECT b FROM t WHERE b <= 10;
                          QUERY PLAN                      
    ------------------------------------------------------
     Data Node Scan
       Output: t.b
       Node/s: All datanodes
       Remote query: SELECT b FROM public.t WHERE b <= 10
    
     Remote SQL: SELECT b FROM public.t WHERE b <= 10
     Datanode Name: datanode1
       Seq Scan on public.t
         Output: b
         Filter: (t.b <= 10)
    
    (11 rows)
    
    通过创建全局二级索引,使上述查询进行DN下推,利用索引扫描加速查询。(使用GSI进行查询优化,需要开启ENABLE_GSISCAN参数以及ENABLE_FAST_QUERY_SHIPPING参数)
    gaussdb=# DROP INDEX i;
    DROP INDEX
    gaussdb=# -- 创建全局二级索引
    gaussdb=# CREATE GLOBAL INDEX gsi_t ON t(b);
    NOTICE:  The 'DISTRIBUTE BY' clause is not specified. Using 'b' as the distribution column by default.
    HINT:  Please use 'DISTRIBUTE BY' clause to specify suitable data distribution column.
    CREATE GLOBAL INDEX
    gaussdb=# -- 点查,查询条件落到全局二级索引分布列上,下推单个DN,并使用索引扫描查询
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF) SELECT b FROM t WHERE b = 1;
                             QUERY PLAN                          
    -------------------------------------------------------------
     Data Node Scan
       Output: gsi_t.b
       Node/s: datanode1
       Remote query: SELECT b FROM ONLY public.gsi_t WHERE b = 1
    
     Remote SQL: SELECT b FROM ONLY public.gsi_t WHERE b = 1
     Datanode Name: datanode1
       [Bypass]
       Index Only Scan using gsi_t on public.gsi_t
         Output: b
         Index Cond: (gsi_t.b = 1)
    
    (12 rows)
    
    gaussdb=# -- 范围查询,下推所有DN,并使用索引扫描查询
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF) SELECT b FROM t WHERE b <= 10;
                              QUERY PLAN                           
    ---------------------------------------------------------------
     Data Node Scan
       Output: gsi_t.b
       Node/s: All datanodes
       Remote query: SELECT b FROM ONLY public.gsi_t WHERE b <= 10
    
     Remote SQL: SELECT b FROM ONLY public.gsi_t WHERE b <= 10
     Datanode Name: datanode1
       [Bypass]
       Index Only Scan using gsi_t on public.gsi_t
         Output: b
         Index Cond: (gsi_t.b <= 10)
    
    (12 rows)
  • 场景二:多表关联查询。
    gaussdb=# DCREATE TABLE t1(c1 INT, c2 INT, c3 INT, c4 INT, c5 INT) DISTRIBUTE BY HASH(c1);
    CREATE TABLE
    gaussdb=# DINSERT INTO t1 VALUES (GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10));
    INSERT 0 10
    gaussdb=# DCREATE TABLE t2(c1 INT, c2 INT, c3 INT, c4 INT, c5 INT) DISTRIBUTE BY HASH(c1);
    CREATE TABLE
    gaussdb=# DINSERT INTO t2 VALUES (3, GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10));
    INSERT 0 10
    gaussdb=# -- 创建普通索引
    gaussdb=# CREATE INDEX i_t1 ON t1(c2);
    CREATE INDEX
    gaussdb=# -- 两表JOIN,查询使用STREAM计划,未下推
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF) SELECT t1.c2 FROM t1, t2 WHERE t1.c2 = t2.c1;
                               QUERY PLAN                            
    -----------------------------------------------------------------
     Streaming (type: GATHER)
       Output: t1.c2
       Node/s: All datanodes
       ->  Hash Join
             Output: t1.c2
             Hash Cond: (t2.c1 = t1.c2)
             ->  Seq Scan on public.t2
                   Output: t2.c1, t2.c2, t2.c3, t2.c4, t2.c5
                   Distribute Key: t2.c1
             ->  Hash
                   Output: t1.c2
                   ->  Streaming(type: REDISTRIBUTE)
                         Output: t1.c2
                         Distribute Key: t1.c2
                         Spawn on: All datanodes
                         Consumer Nodes: All datanodes
                         ->  Index Only Scan using i_t1 on public.t1
                               Output: t1.c2
                               Distribute Key: t1.c1
    (19 rows)

    通过在关联列上创建全局二级索引,使查询下推到某个数据节点。

    gaussdb=# -- 创建GSI
    gaussdb=# CREATE GLOBAL INDEX gsi_t1 ON t1(c2);
    NOTICE:  The 'DISTRIBUTE BY' clause is not specified. Using 'c2' as the distribution column by default.
    HINT:  Please use 'DISTRIBUTE BY' clause to specify suitable data distribution column.
    CREATE GLOBAL INDEX
    gaussdb=# -- 两表JOIN,使用GSI下推所有DN
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF) SELECT t1.c2 FROM t1, t2 WHERE t1.c2 = t2.c1;
                                             QUERY PLAN                                          
    ---------------------------------------------------------------------------------------------
     Data Node Scan
       Output: gsi_t1.c2
       Node/s: All datanodes
       Remote query: SELECT gsi_t1.c2 FROM ONLY public.gsi_t1, public.t2 WHERE gsi_t1.c2 = t2.c1
    
     Remote SQL: SELECT gsi_t1.c2 FROM ONLY public.gsi_t1, public.t2 WHERE gsi_t1.c2 = t2.c1
     Datanode Name: datanode1
       Hash Join
         Output: gsi_t1.c2
         Hash Cond: (t2.c1 = gsi_t1.c2)
         ->  Seq Scan on public.t2
               Output: t2.c1, t2.c2, t2.c3, t2.c4, t2.c5
         ->  Hash
               Output: gsi_t1.c2
               ->  Index Only Scan using gsi_t1 on public.gsi_t1
                     Output: gsi_t1.c2
    
    (17 rows)
  • 场景三:子查询。
    gaussdb=# CREATE TABLE t3(c1 INT, c2 INT, c3 INT, c4 INT, c5 INT) DISTRIBUTE BY HASH(c1);
    CREATE TABLE
    gaussdb=# INSERT INTO t3 VALUES (GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10));
    INSERT 0 10
    gaussdb=# CREATE TABLE t4(c1 INT, c2 INT, c3 INT, c4 INT, c5 INT) DISTRIBUTE BY HASH(c1);
    CREATE TABLE
    gaussdb=# INSERT INTO t4 VALUES (3, GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10), GENERATE_SERIES(1,10));
    INSERT 0 10
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF) SELECT t4.c2 FROM t4, (SELECT c2 FROM t3) aa WHERE aa.c2 = t4.c1;
                     QUERY PLAN                  
    ---------------------------------------------
     Streaming (type: GATHER)
       Output: t4.c2
       Node/s: All datanodes
       ->  Hash Join
             Output: t4.c2
             Hash Cond: (t3.c2 = t4.c1)
             ->  Streaming(type: REDISTRIBUTE)
                   Output: t3.c2
                   Distribute Key: t3.c2
                   Spawn on: All datanodes
                   Consumer Nodes: All datanodes
                   ->  Seq Scan on public.t3
                         Output: t3.c2
                         Distribute Key: t3.c1
             ->  Hash
                   Output: t4.c2, t4.c1
                   ->  Seq Scan on public.t4
                         Output: t4.c2, t4.c1
                         Distribute Key: t4.c1
    (19 rows)

    通过创建全局二级索引,使查询下推数据节点。此时,查询可能需要增加GSI force hint来使用全局二级索引。

    gaussdb=# CREATE GLOBAL INDEX gsi_t3 ON t3(c2) CONTAINING(c3, c4) DISTRIBUTE BY HASH(c2);
    CREATE GLOBAL INDEX
    gaussdb=# -- 使用GSI下推所有DN,需要使用GSI force hint
    gaussdb=# -- 不适用GSI force hint,无法下推
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF) SELECT t4.c2 FROM t4, (SELECT c2 FROM t3) aa WHERE aa.c2 = t4.c1;
                             QUERY PLAN                          
    -------------------------------------------------------------
     Streaming (type: GATHER)
       Output: t4.c2
       Node/s: All datanodes
       ->  Hash Join
             Output: t4.c2
             Hash Cond: (t4.c1 = t3.c2)
             ->  Seq Scan on public.t4
                   Output: t4.c1, t4.c2, t4.c3, t4.c4, t4.c5
                   Distribute Key: t4.c1
             ->  Hash
                   Output: t3.c2
                   ->  Index Only Scan using gsi_t3 on public.t3
                         Output: t3.c2
                         Distribute Key: t3.c2
    (14 rows)
    
    gaussdb=# -- 使用GSI force hint,下推所有DN
    gaussdb=# EXPLAIN (VERBOSE ON, COSTS OFF) SELECT /*+gsi()*/ t4.c2 FROM t4, (SELECT c2 FROM t3) aa WHERE aa.c2 = t4.c1;
                                                            QUERY PLAN                                                         
    ---------------------------------------------------------------------------------------------------------------------------
     Data Node Scan
       Output: t4.c2
       Node/s: All datanodes
       Remote query: SELECT/*+ gsi()*/ t4.c2 FROM public.t4, (SELECT gsi_t3.c2 FROM ONLY public.gsi_t3) aa WHERE aa.c2 = t4.c1
    
     Remote SQL: SELECT/*+ gsi()*/ t4.c2 FROM public.t4, (SELECT gsi_t3.c2 FROM ONLY public.gsi_t3) aa WHERE aa.c2 = t4.c1
     Datanode Name: datanode1
       Hash Join
         Output: t4.c2
         Hash Cond: (t4.c1 = gsi_t3.c2)
         ->  Seq Scan on public.t4
               Output: t4.c1, t4.c2, t4.c3, t4.c4, t4.c5
         ->  Hash
               Output: gsi_t3.c2
               ->  Index Only Scan using gsi_t3 on public.gsi_t3
                     Output: gsi_t3.c2
    
    (17 rows)

全局二级索引回表扫描

  • STREAM分布式计划:创建全局二级索引的基表,查询时需要走全局二级索引回表的场景。如果谓词命中的行数比较多,比如范围查询,优先走STREAM分布式计划。示例如下。
    gaussdb=#  -- 开启STEAM模式
    gaussdb=#  SET ENABLE_STREAM_OPERATOR = ON;
    SET
    gaussdb=#  SET EXPLAIN_PERF_MODE=PRETTY;
    SET
    gaussdb=#  CREATE TABLE test(c1 INT, c2 INT, c3 INT);
    NOTICE:  The 'DISTRIBUTE BY' clause is not specified. Using 'c1' as the distribution column by default.
    HINT:  Please use 'DISTRIBUTE BY' clause to specify suitable data distribution column.
    CREATE TABLE
    gaussdb=# -- 为test表在c2列上创建全局二级索引
    gaussdb=#  CREATE GLOBAL INDEX gsi_test ON test(c2);
    NOTICE:  The 'DISTRIBUTE BY' clause is not specified. Using 'c2' as the distribution column by default.
    HINT:  Please use 'DISTRIBUTE BY' clause to specify suitable data distribution column.
    CREATE GLOBAL INDEX
    gaussdb=#  -- 走全局二级索引STREAM分布式计划
    gaussdb=#  EXPLAIN(VERBOSE ON, COSTS OFF) SELECT /*+ gsitable(test gsi_test)*/ * FROM test WHERE c2 > 1;
    WARNING:  Statistics in some tables or columns(public.test.c1, public.test.c2) are not collected.
    HINT:  Do analyze for them in order to generate optimized plan.
     id |                        operation                         
    ----+----------------------------------------------------------
      1 | ->  Streaming (type: GATHER)
      2 |    ->  GSI Tid Scan on public.test
      3 |       ->  Streaming(type: REDISTRIBUTE GSI)
      4 |          ->  GSI Only Scan using gsi_test on public.test
    (4 rows)
       Predicate Information (identified by plan id)   
    ---------------------------------------------------
       4 --GSI Only Scan using gsi_test on public.test
             Index Cond: (test.c2 > 1)
    (2 rows)
      Targetlist Information (identified by plan id)   
    ---------------------------------------------------
       1 --Streaming (type: GATHER)
             Output: c1, c2, c3
             Node/s: All datanodes
       2 --GSI Tid Scan on public.test
             Output: c1, c2, c3
             Distribute Key: c1
       3 --Streaming(type: REDISTRIBUTE GSI)
             Output: xc_node_hash, ctid
             Spawn on: All datanodes
             Consumer Nodes: All datanodes
       4 --GSI Only Scan using gsi_test on public.test
             Output: xc_node_hash, ctid
             Distribute Key: c2
    (13 rows)
  • 非STREAM分布式计划:创建全局二级索引的基表,查询时需要走全局二级索引回表的场景。如果谓词命中的行数比较少,比如点查,优先走非STREAM分布式计划。示例如下。
    gaussdb=#  -- 关闭STEAM模式
    gaussdb=#  SET ENABLE_STREAM_OPERATOR = OFF;
    SET
    gaussdb=#  CREATE TABLE test(c1 INT, c2 INT, c3 INT);
    NOTICE:  The 'DISTRIBUTE BY' clause is not specified. Using 'c1' as the distribution column by default.
    HINT:  Please use 'DISTRIBUTE BY' clause to specify suitable data distribution column.
    CREATE TABLE
    gaussdb=# -- 为test表在c2列上创建全局二级索引
    gaussdb=#  CREATE GLOBAL INDEX gsi_test ON test(c2);
    NOTICE:  The 'DISTRIBUTE BY' clause is not specified. Using 'c2' as the distribution column by default.
    HINT:  Please use 'DISTRIBUTE BY' clause to specify suitable data distribution column.
    CREATE GLOBAL INDEX
    gaussdb=#  -- 走全局二级索引非STREAM分布式计划
    gaussdb=#  EXPLAIN(VERBOSE ON, COSTS OFF) SELECT /*+ gsitable(test gsi_test)*/ * FROM test WHERE c2 = 1;
                                        QUERY PLAN                                     
    -----------------------------------------------------------------------------------
     Select on public.test
       Output: test.c1, test.c2, test.c3
       Node/s: All datanodes
       Remote query: SELECT c1, c2, c3 FROM ONLY public.test WHERE ctid = $2
       ->  Data Node Scan on gsi_test "REMOTE_TABLE_QUERY"
             Output: gsi_test.xc_node_hash, gsi_test.ctid
             Node/s: datanode1
             Remote query: SELECT xc_node_hash, ctid FROM public.gsi_test WHERE c2 = 1
    (8 rows)

相关文档