本章节主要描述GCC后端常见的几种优化。
FIRST_VIRTUAL_REGISTER
和LAST_VIRTUAL_REGISTER
用于定义伪寄存器,宏REG_ALLOC_ORDER
用来定义寄存器的分配顺序。short *reg_renumber
用于映射伪寄存器到真实寄存器,会被local allocator
和global allocator
多次修改。流水线特征包括了定义指令latency、保留执行单元、指令对之间的latency、定义不同的流水单元等。
(define_automaton names)
(define_insn_reservation insn_name default_latency condition regexp)
(define_insn_reservation "generic_alu" 1
(and (eq_attr "tune" "generic")
(eq_attr "type" "unknown,const,arith,shift,slt,multi,auipc,nop,logical,move,bitmanip,min,max,minu,maxu,clz,ctz,cpop"))
"alu")
(define_insn_reservation "generic_load" 3
(and (eq_attr "tune" "generic")
(eq_attr "type" "load,fpload"))
"alu")
- insn_name 是该保留规则的名称,和define_bypass 相关(define_bypass 通过该名称引用指令)
- default_latency 定义该指令的默认latency;针对不同的target,TARGET_SCHED_ADJUST_COST用于改变默认的latency
- condition定义什么样的RTL指令适用该规则
- regexp描述该指令用于什么样的运算单元
(define_bypass number out_insn_name in_insn_name [guard])
(define_bypass 1 "cpu1_load_*, cpu1_store_*" "cpu1_load_*")
- NUMBER 定义了两条指令对之间的latency,其反映了一条指令in_insn_name 需要等待另一条指令out_insn_name 的结果产生
- guard是一个可选字符串,给出了一个C函数名称,该函数定义了一个附加的处理过程
(define_cpu_unit unit_name)
(define_cpu_unit "sifive_7_A" "sifive_7")
(define_cpu_unit "sifive_7_B" "sifive_7")
(define_insn_reservation "sifive_7_load" 3
(and (eq_attr "tune" "sifive_7")
(eq_attr "type" "load"))
"sifive_7_A")
(define_insn_reservation "sifive_7_branch" 1
(and (eq_attr "tune" "sifive_7")
(eq_attr "type" "branch"))
"sifive_7_B")
这里定义的流水单元结合define_insn_reservation
让指令绑定在特定的单元上,并且可以指定多个单元
考虑到一个超标量CPU可以发出三条指令(两个整数insn和一个浮点insn),但只能完成两个insn。对此定义如下的功能单元
(define_cpu_unit "i0_pipeline, i1_pipeline, f_pipeline")
(define_cpu_unit "port0, port1")
(define_delay test [delay-1 annul-true-1 annul-false-1 delay-2 annul-true-2 annul-false-2 ...] )
pass_sms
,pass_sched
,pass_sched2
和pass_sched_fusion
是GCC中的四种指令调度pass,后面三个pass的默认核心函数都是schedule_insns()
void schedule_insns (void) {
rgn_setup_common_sched_info ();
rgn_setup_sched_infos ();
......
/* Schedule every region in the subroutine. */
for (rgn = 0; rgn < nr_regions; rgn++)
if (dbg_cnt (sched_region))
schedule_region (rgn);
......
}
在调度过程中,一条指令通常处于四种状态中的一种:
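(P) Pending:依赖尚未满足、还不能被调度的指令;(Q) Queued:经过足够的时间之后才可以调度的指令;(R) Ready:尚未调度、可以随时被选中的指令;(S) Scheduled:已经完成调度的指令。
The transition (R->S) is implemented in the scheduling loop in `schedule_block' when the best insn to schedule is chosen. The transitions (P->R and P->Q) are implemented in `schedule_insn' as insns move from the ready list to the scheduled list.
genattrtab.cc 包含查询某些内部属性的函数集合,如tune属性: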
/* Check that attribute NAME is used in define_insn_reservation condition
EXP. Return true if it is. */
static bool check_tune_attr (const char *name, rtx exp);
/* Try to find a const attribute (usually cpu or tune) that is used
in all define_insn_reservation conditions. */
static bool find_tune_attr (rtx exp);
genautomata.cc
包含了处理define_automaton
和define_insn_reservation
等模板的函数集合:
/* The function creates DFA(s) for fast pipeline hazards recognition
after checking and simplifying IR of the description. */
static void create_automata (void);
/* Process a DEFINE_INSN_RESERVATION.
This gives information about the reservation of cpu units by an
insn. We fill a struct insn_reserv_decl with information used
later by `expand_automata'. */
static void gen_insn_reserv (md_rtx_info *info);
......
#define GET_CPU_UNIT_CODE_FUNC_NAME "get_cpu_unit_code"
#define CPU_UNIT_RESERVATION_P_FUNC_NAME "cpu_unit_reservation_p"
#define INSN_HAS_DFA_RESERVATION_P_FUNC_NAME "insn_has_dfa_reservation_p"
......
(sched-deps.cc)
/* Look up the dependency that links producer PRO to consumer CON.
   When true_dependency_cache is non-NULL, the true, output, anti and
   control dependency caches are probed first; if none of them has the
   bit for this pair set, NULL is returned without searching any list.
   Otherwise the lookup is delegated to sd_find_dep_between_no_cache,
   which searches the resolved dependency lists if RESOLVED_P is true.
   Return the dependency, or NULL if none is found.  */
dep_t
sd_find_dep_between (rtx pro, rtx con, bool resolved_p)
{
  if (true_dependency_cache != NULL)
    {
      /* Skipping the list walk below can cut compile times dramatically
         for some code.  */
      const int pro_luid = INSN_LUID (pro);
      const int con_luid = INSN_LUID (con);

      /* Same probe order as a chain of negated tests: stop at the first
         cache that records a dependency for this pair.  */
      const bool maybe_dependent
        = (bitmap_bit_p (&true_dependency_cache[con_luid], pro_luid)
           || bitmap_bit_p (&output_dependency_cache[con_luid], pro_luid)
           || bitmap_bit_p (&anti_dependency_cache[con_luid], pro_luid)
           || bitmap_bit_p (&control_dependency_cache[con_luid], pro_luid));

      if (!maybe_dependent)
        return NULL;
    }

  return sd_find_dep_between_no_cache (pro, con, resolved_p, NULL);
}
/* The actual schedule, kept as a vector of insn pointers.  It is built
   up during the main phase of schedule_block, and afterwards used to
   reorder the insns in the RTL.  */
static vec<rtx_insn *> scheduled_insns;
/* Local state for the loop in schedule_block.  The three flags describe
   what may still be issued in the current cycle; CAN_ISSUE_MORE is the
   remaining issue budget for that cycle.  */
struct sched_block_state
{
/* True if no real insns have been scheduled in the current cycle.  */
bool first_cycle_insn_p;
/* True if a shadow insn has been scheduled in the current cycle, which
   means that no more normal insns can be issued.  */
bool shadows_only_p;
/* True if we're winding down a modulo schedule, which means that we only
   issue insns with INSN_EXACT_TICK set.  */
bool modulo_epilogue;
/* Initialized with the machine's issue rate every cycle, and updated
   by calls to the variable_issue hook.  */
int can_issue_more;
};
GCC中的优化总体来说比较规整,定义也都比较方便(普遍是可以通过宏定义来完成)。