arm_debug: cycle-exact JTAG layer; system-speed memory read works

Read CPU memory over JTAG via system-speed LDM. Validated on the LPC2103:
reads the real ARM reset vectors and contiguous multi-block code.

The core only advances on Run-Test/Idle debug clocks (not Update-DR), so
the trick is keeping that clock count exact:
- "quiet" TAP ops (quiet_set_ir / quiet_shift_dr / quiet_chain_select /
  quiet_eice_read / quiet_latch_chain1) pass through Update but park in
  Pause, never RTI -> they switch chains and read EmbeddedICE WITHOUT
  clocking the core, so they can't clobber the registers a sys-speed LDM
  just loaded.
- clock_core(n) is the only thing that advances the core (n RTI clocks).
- execute_sys_speed: RESTART, then drive the access one clock at a time
  with a quiet DBG_STATUS check between, stopping the instant
  SYSCOMP & DBGACK appear (no over-clock past re-entry).
- after sys-speed: quiet-switch to chain 1, quiet-latch a NOP to displace
  the stale LDMIA, then read_core_regs.
- pre-read pipeline normalization: change_to_arm (17 clocked instrs) for
  a Thumb halt; 17 ARM NOPs for an ARM halt.

WIP: not yet reliable across all halt states - the first read after some
halts times out (SYSCOMP never appears) and leaves the core running.
Within one good halt, reads are consistent and correct. Diagnosis and
next steps in the arm7-debug-dclk-timing note.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-24 20:00:51 +02:00
parent 68229339e9
commit 2c16a66beb

View File

@@ -1,4 +1,5 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
@@ -7,17 +8,23 @@
/*
* ARM7TDMI debug over JTAG (EmbeddedICE), built on the bscan_* TAP
* primitives. Incremental bring-up:
* - done: EmbeddedICE register access; halt (force DBGRQ, then debug
* entry = DBGACK|INTDIS) / resume (RESTART); Thumb->ARM switch;
* cycle-exact chain-1 access (one debug clock per access) -> debug-
* speed register read/write validated by a known-pattern round-trip;
* system-speed LDM re-enters debug (DBGACK&SYSCOMP).
* - WIP: the read_core_regs AFTER a system-speed access is phase-
* shifted by the EmbeddedICE<->chain-1 switch clocks, so memory
* reads come back misaligned (capturing injected instructions). See
* the arm7-debug-dclk-timing note.
* - todo: align the post-sys-speed read, memory write, arm_flash.
* primitives. Bring-up state:
* - done: EmbeddedICE register access; halt / resume; Thumb->ARM
* switch; cycle-exact chain-1 access (one debug clock per access);
* debug-speed register read/write (known-pattern round-trip);
* system-speed LDM memory read. The key to memory reads is keeping
* the core's debug-clock count exact: chain switches and EICE reads
* use "quiet" ops that never enter Run-Test/Idle (so they don't clock
* the core and clobber loaded registers), and the sys-speed poll
* drives the access one clock at a time, stopping the instant
* SYSCOMP appears. Validated by reading the LPC2103 reset vectors and
* contiguous multi-block code.
* - WIP: reliability across halt states. The first read after certain
* halts can time out (sys-speed never reaches SYSCOMP), which leaves
* the core running free; subsequent reads in the same halt are
* consistent and correct. Needs a deterministic pre-read pipeline
* normalization for every halt state. See arm7-debug-dclk-timing.
* - todo: read reliability, memory write, arm_flash.
*/
/* ARM7TDMI public JTAG instructions (IR length 4). */
@@ -129,6 +136,184 @@ static int eice_read(jtag_core *jc, int addr, uint32_t *val)
return 0;
}
/* ---- "Quiet" TAP ops: park in Pause, never enter Run-Test/Idle -------
* The debug core advances one step per Run-Test/Idle clock (Update-DR
* alone does NOT clock it). After a system-speed access has re-entered
* debug, any stray RTI clock executes a garbage instruction and clobbers
* the registers the LDM just loaded. These ops pass through Update but
* never RTI, so they move the TAP / shift IR & DR without clocking the
* core. They start from and leave the TAP in a Pause state. */
/* Leave Run-Test/Idle and park the TAP in Pause-DR. This is the single
 * quiet-layer transition that starts in RTI; per the design notes the core
 * is either running on MCLK (post-RESTART) at this point, or the caller is
 * flushing on purpose. */
static void quiet_enter(jtag_core *jc)
{
    /* RTI -> Select-DR -> Capture-DR -> Exit1-DR -> Pause-DR */
    unsigned char path[4] = { JTAG_STR_TMS, 0, JTAG_STR_TMS, 0 };

    jc->io_functions.drv_TX_TMS(jc, path, 4);
}
/* Return from a Pause state to Run-Test/Idle so the regular RTI-based
 * operations can be used again. */
static void quiet_exit(jtag_core *jc)
{
    /* Pause -> Exit2 -> Update -> Run-Test/Idle */
    unsigned char path[3] = { JTAG_STR_TMS, JTAG_STR_TMS, 0 };

    jc->io_functions.drv_TX_TMS(jc, path, 3);
}
/* Load the instruction register with the low `len` bits of `opcode`
 * (stream element i carries bit i of opcode) and park in Pause-IR.
 * Starts from any Pause state; never enters Run-Test/Idle.
 *
 * Returns 0 on success, -1 if `len` is not positive or the bit buffer
 * cannot be allocated. All validation and allocation happen BEFORE the
 * first TMS is sent, so on failure the TAP has not moved and is still
 * parked in the Pause state it started from. */
static int quiet_set_ir(jtag_core *jc, unsigned int opcode, int len)
{
    /* Pause -> Exit2 -> Update -> Select-DR -> Select-IR -> Capture-IR -> Shift-IR */
    unsigned char to_shift[6] = {
        JTAG_STR_TMS, JTAG_STR_TMS, JTAG_STR_TMS, JTAG_STR_TMS, 0, 0
    };
    unsigned char to_pause[1] = { 0 };  /* Exit1-IR -> Pause-IR */
    const int opcode_bits = (int)(sizeof opcode * 8);
    unsigned char *bits;
    int i;

    /* A zero-length shift would never leave Shift-IR; refuse it up front. */
    if (len <= 0)
        return -1;

    bits = malloc((size_t)len);
    if (!bits)
        return -1;

    for (i = 0; i < len; i++) {
        /* Bits beyond the width of opcode are sent as 0 (shifting by
         * >= the type width would be undefined). */
        unsigned int b = (i < opcode_bits) ? ((opcode >> i) & 1u) : 0u;
        bits[i] = b ? JTAG_STR_DOUT : 0;
    }
    bits[len - 1] |= JTAG_STR_TMS;      /* last bit -> Exit1-IR */

    jc->io_functions.drv_TX_TMS(jc, to_shift, 6);
    jc->io_functions.drv_TXRX_DATA(jc, bits, NULL, len);
    free(bits);
    jc->io_functions.drv_TX_TMS(jc, to_pause, 1);
    return 0;
}
/* Shift `n` bits through the data register and park in Pause-DR. Starts
 * from any Pause state; passes through Update but never enters
 * Run-Test/Idle.
 *
 *   tdi  packed bits to shift in (bit i is tdi[i/8] bit i%8); NULL shifts
 *        in all zeros.
 *   tdo  if non-NULL, receives the captured bits in the same packed layout
 *        ((n+7)/8 bytes are written).
 *
 * Returns 0 on success, -1 if `n` is not positive or a buffer cannot be
 * allocated. Validation and allocation happen BEFORE the first TMS is
 * sent, so on failure the TAP is still parked where it started rather
 * than stranded in Shift-DR. */
static int quiet_shift_dr(jtag_core *jc, const uint8_t *tdi, uint8_t *tdo, int n)
{
    /* Pause -> Exit2 -> Update -> Select-DR -> Capture-DR -> Shift-DR */
    unsigned char to_shift[5] = {
        JTAG_STR_TMS, JTAG_STR_TMS, JTAG_STR_TMS, 0, 0
    };
    unsigned char to_pause[1] = { 0 };  /* Exit1-DR -> Pause-DR */
    unsigned char *out;
    unsigned char *in = NULL;
    int i;

    /* A zero-length shift would never leave Shift-DR; refuse it up front. */
    if (n <= 0)
        return -1;

    out = malloc((size_t)n);
    if (!out)
        return -1;
    if (tdo) {
        in = malloc((size_t)n);
        if (!in) {
            free(out);
            return -1;
        }
    }

    /* Expand the packed TDI bits into one driver byte per TCK. */
    for (i = 0; i < n; i++) {
        uint8_t b = tdi ? ((tdi[i / 8] >> (i & 7)) & 1u) : 0;
        out[i] = b ? JTAG_STR_DOUT : 0;
    }
    out[n - 1] |= JTAG_STR_TMS;         /* last bit -> Exit1-DR */

    jc->io_functions.drv_TX_TMS(jc, to_shift, 5);
    jc->io_functions.drv_TXRX_DATA(jc, out, in, n);

    if (tdo) {
        /* Re-pack the captured per-TCK bytes into tdo. */
        memset(tdo, 0, (size_t)((n + 7) / 8));
        for (i = 0; i < n; i++) {
            if (in[i])
                tdo[i / 8] |= (uint8_t)(1u << (i & 7));
        }
    }
    free(out);
    free(in);

    jc->io_functions.drv_TX_TMS(jc, to_pause, 1);
    return 0;
}
/* Switch to scan chain `chain` using only quiet ops, i.e. without clocking
 * the core: SCAN_N into IR, the chain number into DR, then INTEST into IR.
 * Expects the TAP in a Pause state and leaves it in Pause-IR with INTEST
 * loaded. Returns 0 on success, -1 as soon as any step fails. */
static int quiet_chain_select(jtag_core *jc, int chain)
{
    uint8_t chain_no = (uint8_t)chain;

    /* Short-circuit evaluation keeps the three steps strictly ordered and
     * stops at the first failure. */
    if (quiet_set_ir(jc, IR_SCAN_N, ARM7_IR_LEN) < 0 ||
        quiet_shift_dr(jc, &chain_no, NULL, ARM7_IR_LEN) < 0 ||
        quiet_set_ir(jc, IR_INTEST, ARM7_IR_LEN) < 0)
        return -1;

    return 0;
}
/* Pack a 38-bit EmbeddedICE scan-chain-2 frame into buf, LSB first:
 *   bits  0..31  data
 *   bits 32..36  register address (low 5 bits of addr)
 *   bit  37      read/write flag (set when rw is non-zero)
 * The two unused top bits of buf[4] are always cleared. */
static void eice_frame(uint8_t buf[5], int addr, int rw, uint32_t data)
{
    /* Data word, little-endian across the first four bytes. */
    buf[0] = (uint8_t)(data & 0xffu);
    buf[1] = (uint8_t)((data >> 8) & 0xffu);
    buf[2] = (uint8_t)((data >> 16) & 0xffu);
    buf[3] = (uint8_t)((data >> 24) & 0xffu);

    /* Frame bits 32..36 are byte 4 bits 0..4; frame bit 37 is byte 4 bit 5. */
    buf[4] = (uint8_t)((addr & 0x1f) | (rw ? 0x20 : 0));
}
/* Put `instr` into the scan-chain-1 instruction register without giving
 * the core a debug clock: the 33-bit frame is shifted in and the TAP goes
 * through Update-DR (which latches the parallel output) but never through
 * Run-Test/Idle. The purpose is to push out a stale instruction (such as
 * the system-speed LDMIA) that the next debug clock would otherwise run
 * again, clobbering registers. Requires chain 1 with INTEST selected;
 * begins and ends in a Pause state. Always returns 0. */
static int quiet_latch_chain1(jtag_core *jc, uint32_t instr)
{
    /* Pause -> Exit2 -> Update -> Select-DR -> Capture-DR -> Shift-DR */
    unsigned char to_shift[5] = {
        JTAG_STR_TMS, JTAG_STR_TMS, JTAG_STR_TMS, 0, 0
    };
    /* Exit1 -> Update-DR (latch, no clock) -> Select-DR -> Capture-DR
     * -> Exit1 -> Pause-DR */
    unsigned char to_pause[5] = {
        JTAG_STR_TMS, JTAG_STR_TMS, 0, JTAG_STR_TMS, 0
    };
    unsigned char frame[33];
    const uint32_t reversed = flip32(instr);
    int bit;

    jc->io_functions.drv_TX_TMS(jc, to_shift, 5);

    frame[0] = 0;                       /* no SYSSPEED */
    for (bit = 0; bit < 32; bit++)
        frame[bit + 1] = ((reversed >> bit) & 1u) ? JTAG_STR_DOUT : 0;
    frame[32] |= JTAG_STR_TMS;          /* last bit -> Exit1-DR */

    jc->io_functions.drv_TXRX_DATA(jc, frame, NULL, 33);
    jc->io_functions.drv_TX_TMS(jc, to_pause, 5);
    return 0;
}
/* Clock the debug core exactly n steps: sit in Run-Test/Idle for n TCKs,
 * then return to a Pause state. This is the ONLY thing that advances the
 * core, so callers control the debug-clock count precisely (the quiet
 * chain switches and EICE reads never enter RTI, hence never clock it).
 * Starts from any Pause state, ends in Pause-DR.
 *
 * n <= 0 gives the core NO clock: the TAP goes Update -> Select-DR
 * directly, bypassing Run-Test/Idle, and still ends in Pause-DR.
 *
 * The TMS stream is built in a fixed stack buffer and flushed in chunks
 * for large n, so the function cannot fail on allocation and silently
 * drop the requested clocks. For small n the whole sequence goes out in a
 * single drv_TX_TMS call. */
static void clock_core(jtag_core *jc, int n)
{
    enum { CLOCK_CORE_CHUNK = 256, CLOCK_CORE_TAIL = 4 };
    unsigned char tms[CLOCK_CORE_CHUNK];
    int k = 0;
    int dwell;

    tms[k++] = JTAG_STR_TMS;            /* Pause -> Exit2 */
    tms[k++] = JTAG_STR_TMS;            /* -> Update */

    if (n > 0) {
        tms[k++] = 0;                   /* -> Run-Test/Idle (1st in-RTI clock) */
        for (dwell = n - 1; dwell > 0; dwell--) {
            if (k == CLOCK_CORE_CHUNK) {
                /* Buffer full: push what we have and keep dwelling in RTI. */
                jc->io_functions.drv_TX_TMS(jc, tms, k);
                k = 0;
            }
            tms[k++] = 0;               /* dwell: n total RTI clocks */
        }
    }

    /* Make sure the four-step return path fits in the current chunk. */
    if (k + CLOCK_CORE_TAIL > CLOCK_CORE_CHUNK) {
        jc->io_functions.drv_TX_TMS(jc, tms, k);
        k = 0;
    }
    tms[k++] = JTAG_STR_TMS;            /* RTI (or Update when n <= 0) -> Select-DR */
    tms[k++] = 0;                       /* -> Capture-DR */
    tms[k++] = JTAG_STR_TMS;            /* -> Exit1-DR */
    tms[k++] = 0;                       /* -> Pause-DR */
    jc->io_functions.drv_TX_TMS(jc, tms, k);
}
/* Fetch EmbeddedICE register `addr` into *val using only quiet DR shifts,
 * so the core is not clocked. The caller must already have selected the
 * EmbeddedICE chain (#2) with quiet ops. The read frame is shifted twice:
 * the first pass issues the request, the second captures the result.
 * Returns 0 on success, -1 if either shift fails (*val is then untouched). */
static int quiet_eice_read(jtag_core *jc, int addr, uint32_t *val)
{
    uint8_t request[5], reply[5];

    eice_frame(request, addr, 0, 0);

    if (quiet_shift_dr(jc, request, NULL, 38) < 0 ||    /* request */
        quiet_shift_dr(jc, request, reply, 38) < 0)     /* capture */
        return -1;

    /* The low 32 bits of the captured frame are the register value. */
    *val = (uint32_t)reply[0]
         | ((uint32_t)reply[1] << 8)
         | ((uint32_t)reply[2] << 16)
         | ((uint32_t)reply[3] << 24);
    return 0;
}
int arm_debug_halt(jtag_core *jc, const jtag_target *t)
{
uint32_t status = 0;
@@ -318,16 +503,21 @@ static int execute_sys_speed(jtag_core *jc)
uint32_t status = 0;
int tries;
/* RESTART resumes the core to run the one system-speed access. The
* core needs debug clocks to step through it, but once it re-enters
* debug any further clock executes a stale instruction and clobbers
* the loaded registers. So drive it ONE clock at a time and check
* DBG_STATUS QUIETLY (no clock) between, stopping the instant
* SYSCOMP & DBGACK appear. Leave the TAP parked (Pause) on EICE. */
if (bscan_set_ir(jc, IR_RESTART, ARM7_IR_LEN) < 0) return -1;
/* Poll DBG_STATUS; the EmbeddedICE scans clock the core enough to
* complete the one system-speed access and re-enter debug. (Matches
* OpenOCD: RESTART then poll, no runtest burst in between.) */
if (eice_select(jc) < 0) return -1;
for (tries = 0; tries < 100; tries++) {
if (eice_read(jc, EICE_DBG_STATUS, &status) < 0) return -1;
quiet_enter(jc);
if (quiet_chain_select(jc, SC_EICE) < 0) return -1;
for (tries = 0; tries < 2000; tries++) {
if (quiet_eice_read(jc, EICE_DBG_STATUS, &status) < 0) return -1;
if ((status & DBG_STATUS_DBGACK) && (status & DBG_STATUS_SYSCOMP))
return 0;
clock_core(jc, 1);
}
fprintf(stderr, "arm_debug: sys-speed access timed out (status 0x%08x)\n", status);
return -1;
@@ -338,11 +528,10 @@ static int execute_sys_speed(jtag_core *jc)
* Core registers r0..r14 are clobbered (acceptable for a read-then-
* power-cycle flow). The core must already be halted (DBGACK).
*
* WORK IN PROGRESS: chain-1 is now cycle-exact (register read/write
* round-trips) and the system-speed LDM re-enters debug, but the
* read_core_regs that follows execute_sys_speed is phase-shifted by the
* EmbeddedICE<->chain-1 switch, so the returned words are misaligned.
* See the arm7-debug-dclk-timing design note. */
* Reads real memory correctly (validated: LPC2103 vectors + multi-block
* code). WORK IN PROGRESS: not yet reliable across all halt states - the
* first read after some halts times out and leaves the core running. See
* the arm7-debug-dclk-timing design note. */
int arm_debug_mem_read(jtag_core *jc, const jtag_target *t,
unsigned long addr, void *buf, unsigned long len)
{
@@ -364,11 +553,21 @@ int arm_debug_mem_read(jtag_core *jc, const jtag_target *t,
if (chain_select(jc, SC_EICE) < 0) return -1;
if (eice_read(jc, EICE_DBG_STATUS, &status) < 0) return -1;
if (chain_select(jc, SC_DEBUG) < 0) return -1;
/* Normalize the core to a known ARM pipeline state regardless of the
* halt state. In Thumb, change_to_arm (17 clocked instructions)
* switches to ARM and flushes the firmware out of the pipeline; the
* read alignment is tuned for that. In ARM, run the same NUMBER of
* clocked NOPs so the read sees the same pipeline phase (skipping it
* left the firmware's arbitrary pipeline and the read misaligned). */
c1_init(&c1, jc);
if (status & DBG_STATUS_ITBIT) {
c1_init(&c1, jc);
if (change_to_arm(&c1) < 0) return -1;
c1_end(&c1);
} else {
int k;
for (k = 0; k < 17; k++)
if (c1_xfer(&c1, ARM_NOP, 0, NULL) < 0) return -1;
}
c1_end(&c1);
r0 = (uint32_t)base;
@@ -382,17 +581,28 @@ int arm_debug_mem_read(jtag_core *jc, const jtag_target *t,
/* r1..rn, base (r0) excluded so it can be the autoincrement ptr. */
reg_list = (uint32_t)((0xffffu >> (15 - n)) & 0xfffe);
if (chain_select(jc, SC_DEBUG) < 0) return -1;
/* On chain 1 (from change_to_arm or the previous read): set r0
* once, then queue the system-speed LDM. */
c1_init(&c1, jc);
if (done == 0) /* set r0 once; LDM writeback advances it */
if (done == 0) /* LDM writeback advances r0 after */
if (write_core_regs(&c1, 0, 0x1, &r0) < 0) return -1;
if (load_word_regs(&c1, reg_list) < 0) return -1;
c1_end(&c1);
/* RESTART + quiet poll; leaves the TAP parked on EmbeddedICE. */
if (execute_sys_speed(jc) < 0) return -1;
/* execute_sys_speed left us on the EmbeddedICE chain. */
if (chain_select(jc, SC_DEBUG) < 0) return -1;
/* Switch back to chain 1 WITHOUT clocking the core (a normal
* chain_select would clobber the just-loaded registers), then
* read them out. execute_sys_speed left us parked (Pause) on the
* EmbeddedICE chain. */
if (quiet_chain_select(jc, SC_DEBUG) < 0) return -1;
/* Displace the stale (system-speed LDMIA) instruction with a NOP
* so the first debug clock re-executes a NOP, not the LDMIA
* (which would reload r1..rn from the debug bus and lose the
* memory data). */
if (quiet_latch_chain1(jc, ARM_NOP) < 0) return -1;
quiet_exit(jc);
memset(regs, 0, sizeof(regs));
c1_init(&c1, jc);
if (read_core_regs(&c1, 0, reg_list, regs) < 0) return -1;