diff --git a/src/modules/arm_debug/arm_debug.c b/src/modules/arm_debug/arm_debug.c index 51f3396..70f1529 100644 --- a/src/modules/arm_debug/arm_debug.c +++ b/src/modules/arm_debug/arm_debug.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -7,17 +8,23 @@ /* * ARM7TDMI debug over JTAG (EmbeddedICE), built on the bscan_* TAP - * primitives. Incremental bring-up: - * - done: EmbeddedICE register access; halt (force DBGRQ, then debug - * entry = DBGACK|INTDIS) / resume (RESTART); Thumb->ARM switch; - * cycle-exact chain-1 access (one debug clock per access) -> debug- - * speed register read/write validated by a known-pattern round-trip; - * system-speed LDM re-enters debug (DBGACK&SYSCOMP). - * - WIP: the read_core_regs AFTER a system-speed access is phase- - * shifted by the EmbeddedICE<->chain-1 switch clocks, so memory - * reads come back misaligned (capturing injected instructions). See - * the arm7-debug-dclk-timing note. - * - todo: align the post-sys-speed read, memory write, arm_flash. + * primitives. Bring-up state: + * - done: EmbeddedICE register access; halt / resume; Thumb->ARM + * switch; cycle-exact chain-1 access (one debug clock per access); + * debug-speed register read/write (known-pattern round-trip); + * system-speed LDM memory read. The key to memory reads is keeping + * the core's debug-clock count exact: chain switches and EICE reads + * use "quiet" ops that never enter Run-Test/Idle (so they don't clock + * the core and clobber loaded registers), and the sys-speed poll + * drives the access one clock at a time, stopping the instant + * SYSCOMP appears. Validated by reading the LPC2103 reset vectors and + * contiguous multi-block code. + * - WIP: reliability across halt states. The first read after certain + * halts can time out (sys-speed never reaches SYSCOMP), which leaves + * the core running free; subsequent reads in the same halt are + * consistent and correct. 
Needs a deterministic pre-read pipeline + * normalization for every halt state. See arm7-debug-dclk-timing. + * - todo: read reliability, memory write, arm_flash. */ /* ARM7TDMI public JTAG instructions (IR length 4). */ @@ -129,6 +136,184 @@ static int eice_read(jtag_core *jc, int addr, uint32_t *val) return 0; } +/* ---- "Quiet" TAP ops: park in Pause, never enter Run-Test/Idle ------- + * The debug core advances one step per Run-Test/Idle clock (Update-DR + * alone does NOT clock it). After a system-speed access has re-entered + * debug, any stray RTI clock executes a garbage instruction and clobbers + * the registers the LDM just loaded. These ops pass through Update but + * never RTI, so they move the TAP / shift IR & DR without clocking the + * core. They start from and leave the TAP in a Pause state. */ + +/* Run-Test/Idle -> Pause-DR (the one transition that leaves RTI; the core + * is either running on MCLK (post-RESTART) or this is a deliberate flush). */ +static void quiet_enter(jtag_core *jc) +{ + unsigned char tms[4]; + tms[0] = JTAG_STR_TMS; /* RTI -> Select-DR */ + tms[1] = 0; /* -> Capture-DR */ + tms[2] = JTAG_STR_TMS; /* -> Exit1-DR */ + tms[3] = 0; /* -> Pause-DR */ + jc->io_functions.drv_TX_TMS(jc, tms, 4); +} + +/* Pause-* -> ... -> Run-Test/Idle (re-arms the normal RTI-based ops). */ +static void quiet_exit(jtag_core *jc) +{ + unsigned char tms[3]; + tms[0] = JTAG_STR_TMS; /* Pause -> Exit2 */ + tms[1] = JTAG_STR_TMS; /* -> Update */ + tms[2] = 0; /* -> Run-Test/Idle */ + jc->io_functions.drv_TX_TMS(jc, tms, 3); +} + +/* Set IR = opcode (len bits), parking in Pause-IR. Starts from any Pause. 
*/ +static int quiet_set_ir(jtag_core *jc, unsigned int opcode, int len) +{ + unsigned char tms[6], *d; + int i; + /* Pause -> Exit2 -> Update -> Select-DR -> Select-IR -> Capture-IR -> Shift-IR */ + tms[0] = JTAG_STR_TMS; tms[1] = JTAG_STR_TMS; tms[2] = JTAG_STR_TMS; + tms[3] = JTAG_STR_TMS; tms[4] = 0; tms[5] = 0; + jc->io_functions.drv_TX_TMS(jc, tms, 6); + + d = malloc(len); + if (!d) return -1; + for (i = 0; i < len; i++) { + d[i] = ((opcode >> i) & 1u) ? JTAG_STR_DOUT : 0; + if (i == len - 1) d[i] |= JTAG_STR_TMS; /* last bit -> Exit1-IR */ + } + jc->io_functions.drv_TXRX_DATA(jc, d, NULL, len); + free(d); + + tms[0] = 0; /* Exit1-IR -> Pause-IR */ + jc->io_functions.drv_TX_TMS(jc, tms, 1); + return 0; +} + +/* Shift DR (n bits), parking in Pause-DR. Starts from any Pause. */ +static int quiet_shift_dr(jtag_core *jc, const uint8_t *tdi, uint8_t *tdo, int n) +{ + unsigned char tms[5], *out, *in = NULL; + int i; + /* Pause -> Exit2 -> Update -> Select-DR -> Capture-DR -> Shift-DR */ + tms[0] = JTAG_STR_TMS; tms[1] = JTAG_STR_TMS; tms[2] = JTAG_STR_TMS; + tms[3] = 0; tms[4] = 0; + jc->io_functions.drv_TX_TMS(jc, tms, 5); + + out = malloc(n); + if (!out) return -1; + if (tdo) { in = malloc(n); if (!in) { free(out); return -1; } } + for (i = 0; i < n; i++) { + uint8_t b = tdi ? ((tdi[i / 8] >> (i & 7)) & 1u) : 0; + out[i] = b ? JTAG_STR_DOUT : 0; + if (i == n - 1) out[i] |= JTAG_STR_TMS; /* last bit -> Exit1-DR */ + } + jc->io_functions.drv_TXRX_DATA(jc, out, in, n); + if (tdo && in) { + memset(tdo, 0, (size_t)((n + 7) / 8)); + for (i = 0; i < n; i++) if (in[i]) tdo[i / 8] |= (uint8_t)(1u << (i & 7)); + } + free(out); free(in); + + tms[0] = 0; /* Exit1-DR -> Pause-DR */ + jc->io_functions.drv_TX_TMS(jc, tms, 1); + return 0; +} + +/* Select a scan chain without clocking the core (quiet). Starts from a + * Pause state, ends in Pause-IR (INTEST loaded). 
*/ +static int quiet_chain_select(jtag_core *jc, int chain) +{ + uint8_t sc = (uint8_t)chain; + if (quiet_set_ir(jc, IR_SCAN_N, ARM7_IR_LEN) < 0) return -1; + if (quiet_shift_dr(jc, &sc, NULL, ARM7_IR_LEN) < 0) return -1; + if (quiet_set_ir(jc, IR_INTEST, ARM7_IR_LEN) < 0) return -1; + return 0; +} + +/* Build a 38-bit EmbeddedICE scan-2 frame (data|addr|rw) into buf. */ +static void eice_frame(uint8_t buf[5], int addr, int rw, uint32_t data) +{ + int i; + memset(buf, 0, 5); + for (i = 0; i < 32; i++) + if (data & (1u << i)) buf[i >> 3] |= (uint8_t)(1u << (i & 7)); + for (i = 0; i < 5; i++) + if (addr & (1 << i)) { int b = 32 + i; buf[b >> 3] |= (uint8_t)(1u << (b & 7)); } + if (rw) buf[37 >> 3] |= (uint8_t)(1u << (37 & 7)); +} + +/* Latch an instruction into the scan-chain-1 instruction register + * WITHOUT clocking the core: shift the 33-bit frame, pass through + * Update-DR (latches the parallel output) but never Run-Test/Idle. Used + * to displace a stale instruction (e.g. the system-speed LDMIA) that + * would otherwise be re-executed and clobber registers on the next + * debug clock. Chain 1 + INTEST must be selected; starts/ends in Pause. */ +static int quiet_latch_chain1(jtag_core *jc, uint32_t instr) +{ + unsigned char tms[6], out[33]; + uint32_t f = flip32(instr); + int i; + + /* Pause -> Exit2 -> Update -> Select-DR -> Capture-DR -> Shift-DR */ + tms[0] = JTAG_STR_TMS; tms[1] = JTAG_STR_TMS; tms[2] = JTAG_STR_TMS; + tms[3] = 0; tms[4] = 0; + jc->io_functions.drv_TX_TMS(jc, tms, 5); + + out[0] = 0; /* no SYSSPEED */ + for (i = 0; i < 32; i++) + out[1 + i] = (f & (1u << i)) ? 
JTAG_STR_DOUT : 0; + out[32] |= JTAG_STR_TMS; /* last bit -> Exit1-DR */ + jc->io_functions.drv_TXRX_DATA(jc, out, NULL, 33); + + /* Exit1 -> Update-DR (latch, no clock) -> Select-DR -> Capture-DR + * -> Exit1 -> Pause-DR */ + tms[0] = JTAG_STR_TMS; tms[1] = JTAG_STR_TMS; tms[2] = 0; + tms[3] = JTAG_STR_TMS; tms[4] = 0; + jc->io_functions.drv_TX_TMS(jc, tms, 5); + return 0; +} + +/* Clock the debug core exactly n steps: enter Run-Test/Idle for n TCKs, + * then return to a Pause state. This is the ONLY thing that advances the + * core, so callers control the debug-clock count precisely (the chain + * switches and EICE reads above never enter RTI, hence never clock it). + * Starts from any Pause state, ends in Pause-DR. */ +static void clock_core(jtag_core *jc, int n) +{ + unsigned char *tms; + int i, k = 0; + if (n < 0) n = 0; + tms = malloc((size_t)(n + 8)); + if (!tms) return; + tms[k++] = JTAG_STR_TMS; /* Pause -> Exit2 */ + tms[k++] = JTAG_STR_TMS; /* -> Update */ + tms[k++] = 0; /* -> Run-Test/Idle (1st in-RTI clock) */ + for (i = 1; i < n; i++) tms[k++] = 0; /* dwell: n total RTI clocks */ + tms[k++] = JTAG_STR_TMS; /* RTI -> Select-DR */ + tms[k++] = 0; /* -> Capture-DR */ + tms[k++] = JTAG_STR_TMS; /* -> Exit1-DR */ + tms[k++] = 0; /* -> Pause-DR */ + jc->io_functions.drv_TX_TMS(jc, tms, k); + free(tms); +} + +/* Read an EmbeddedICE register without clocking the core (quiet). The + * EmbeddedICE chain (#2) must already be selected via quiet ops. 
*/
+static int quiet_eice_read(jtag_core *jc, int addr, uint32_t *val)
+{
+    uint8_t buf[5], cap[5]; /* 38-bit frames packed LSB-first: bit i is bit (i & 7) of byte i >> 3 */
+    uint32_t v = 0;
+    int i;
+    eice_frame(buf, addr, 0, 0); /* rw = 0, data = 0: frame addressing register `addr` */
+    if (quiet_shift_dr(jc, buf, NULL, 38) < 0) return -1; /* request: shift the frame in, TDO discarded */
+    if (quiet_shift_dr(jc, buf, cap, 38) < 0) return -1; /* capture: shift the same frame again, keeping TDO in cap (presumably the value selected by the first scan) */
+    for (i = 0; i < 32; i++) /* unpack the low 32 captured bits (the data field) into v */
+        if (cap[i >> 3] & (1u << (i & 7))) v |= (1u << i);
+    *val = v; /* only written on success; a failed scan returns -1 above and leaves *val untouched */
+    return 0;
+}
+
 int arm_debug_halt(jtag_core *jc, const jtag_target *t)
 {
     uint32_t status = 0;
@@ -318,16 +503,21 @@ static int execute_sys_speed(jtag_core *jc)
     uint32_t status = 0;
     int tries;
 
+    /* RESTART resumes the core to run the one system-speed access. The
+     * core needs debug clocks to step through it, but once it re-enters
+     * debug any further clock executes a stale instruction and clobbers
+     * the loaded registers. So drive it ONE clock at a time and check
+     * DBG_STATUS QUIETLY (no clock) between, stopping the instant
+     * SYSCOMP & DBGACK appear. Leave the TAP parked (Pause) on EICE.
+     *
+     * The poll is bounded at 2000 status reads, each followed by a
+     * single clock_core(jc, 1); on expiry the last status word is
+     * printed to stderr and -1 is returned.
+     *
+     * NOTE(review): every `return -1` after quiet_enter() - including
+     * the timeout - also leaves the TAP parked in a Pause state; nothing
+     * visible here calls quiet_exit() on those paths. Confirm that
+     * callers do not follow a failure with RTI-based operations that
+     * expect the TAP in Run-Test/Idle. */
     if (bscan_set_ir(jc, IR_RESTART, ARM7_IR_LEN) < 0) return -1;
-    /* Poll DBG_STATUS; the EmbeddedICE scans clock the core enough to
-     * complete the one system-speed access and re-enter debug. (Matches
-     * OpenOCD: RESTART then poll, no runtest burst in between.) */
-    if (eice_select(jc) < 0) return -1;
-    for (tries = 0; tries < 100; tries++) {
-        if (eice_read(jc, EICE_DBG_STATUS, &status) < 0) return -1;
+    quiet_enter(jc);
+    if (quiet_chain_select(jc, SC_EICE) < 0) return -1;
+    for (tries = 0; tries < 2000; tries++) {
+        if (quiet_eice_read(jc, EICE_DBG_STATUS, &status) < 0) return -1;
         if ((status & DBG_STATUS_DBGACK) && (status & DBG_STATUS_SYSCOMP))
             return 0;
+        clock_core(jc, 1);
     }
     fprintf(stderr, "arm_debug: sys-speed access timed out (status 0x%08x)\n", status);
     return -1;
@@ -338,11 +528,10 @@ static int execute_sys_speed(jtag_core *jc)
  * Core registers r0..r14 are clobbered (acceptable for a read-then-
  * power-cycle flow). The core must already be halted (DBGACK).
* - * WORK IN PROGRESS: chain-1 is now cycle-exact (register read/write - * round-trips) and the system-speed LDM re-enters debug, but the - * read_core_regs that follows execute_sys_speed is phase-shifted by the - * EmbeddedICE<->chain-1 switch, so the returned words are misaligned. - * See the arm7-debug-dclk-timing design note. */ + * Reads real memory correctly (validated: LPC2103 vectors + multi-block + * code). WORK IN PROGRESS: not yet reliable across all halt states - the + * first read after some halts times out and leaves the core running. See + * the arm7-debug-dclk-timing design note. */ int arm_debug_mem_read(jtag_core *jc, const jtag_target *t, unsigned long addr, void *buf, unsigned long len) { @@ -364,11 +553,21 @@ int arm_debug_mem_read(jtag_core *jc, const jtag_target *t, if (chain_select(jc, SC_EICE) < 0) return -1; if (eice_read(jc, EICE_DBG_STATUS, &status) < 0) return -1; if (chain_select(jc, SC_DEBUG) < 0) return -1; + /* Normalize the core to a known ARM pipeline state regardless of the + * halt state. In Thumb, change_to_arm (17 clocked instructions) + * switches to ARM and flushes the firmware out of the pipeline; the + * read alignment is tuned for that. In ARM, run the same NUMBER of + * clocked NOPs so the read sees the same pipeline phase (skipping it + * left the firmware's arbitrary pipeline and the read misaligned). */ + c1_init(&c1, jc); if (status & DBG_STATUS_ITBIT) { - c1_init(&c1, jc); if (change_to_arm(&c1) < 0) return -1; - c1_end(&c1); + } else { + int k; + for (k = 0; k < 17; k++) + if (c1_xfer(&c1, ARM_NOP, 0, NULL) < 0) return -1; } + c1_end(&c1); r0 = (uint32_t)base; @@ -382,17 +581,28 @@ int arm_debug_mem_read(jtag_core *jc, const jtag_target *t, /* r1..rn, base (r0) excluded so it can be the autoincrement ptr. 
*/ reg_list = (uint32_t)((0xffffu >> (15 - n)) & 0xfffe); - if (chain_select(jc, SC_DEBUG) < 0) return -1; + /* On chain 1 (from change_to_arm or the previous read): set r0 + * once, then queue the system-speed LDM. */ c1_init(&c1, jc); - if (done == 0) /* set r0 once; LDM writeback advances it */ + if (done == 0) /* LDM writeback advances r0 after */ if (write_core_regs(&c1, 0, 0x1, &r0) < 0) return -1; if (load_word_regs(&c1, reg_list) < 0) return -1; c1_end(&c1); + /* RESTART + quiet poll; leaves the TAP parked on EmbeddedICE. */ if (execute_sys_speed(jc) < 0) return -1; - /* execute_sys_speed left us on the EmbeddedICE chain. */ - if (chain_select(jc, SC_DEBUG) < 0) return -1; + /* Switch back to chain 1 WITHOUT clocking the core (a normal + * chain_select would clobber the just-loaded registers), then + * read them out. execute_sys_speed left us parked (Pause) on the + * EmbeddedICE chain. */ + if (quiet_chain_select(jc, SC_DEBUG) < 0) return -1; + /* Displace the stale (system-speed LDMIA) instruction with a NOP + * so the first debug clock re-executes a NOP, not the LDMIA + * (which would reload r1..rn from the debug bus and lose the + * memory data). */ + if (quiet_latch_chain1(jc, ARM_NOP) < 0) return -1; + quiet_exit(jc); memset(regs, 0, sizeof(regs)); c1_init(&c1, jc); if (read_core_regs(&c1, 0, reg_list, regs) < 0) return -1;