cpu_read dumped the LPC2103's full 32 KB flash to Intel HEX (objcopy-verified: all records/checksums valid, correct vectors). Update the comments to reflect the working state and the power-on -> one halt -> dump flow (context save/restore for repeated reads is the next step). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
673 lines
25 KiB
C
673 lines
25 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stdint.h>
|
|
|
|
#include "bscan/bscan.h"
|
|
#include "arm_debug.h"
|
|
|
|
/*
 * ARM7TDMI debug over JTAG (EmbeddedICE), built on the bscan_* TAP
 * primitives. Bring-up state:
 *   - done: EmbeddedICE register access; halt / resume; Thumb->ARM
 *     switch; cycle-exact chain-1 access (one debug clock per access);
 *     debug-speed register read/write (known-pattern round-trip);
 *     system-speed LDM memory read. The key to memory reads is keeping
 *     the core's debug-clock count exact: chain switches and EICE reads
 *     use "quiet" ops that never enter Run-Test/Idle (so they don't clock
 *     the core and clobber loaded registers), and the sys-speed poll
 *     drives the access one clock at a time, stopping the instant
 *     SYSCOMP appears. Validated by dumping the LPC2103's full 32 KB
 *     flash to Intel HEX (objcopy-verified, correct vectors + code).
 *   - caveat: the read clobbers r0..r14 and there is no context
 *     save/restore, so the intended flow is power-on -> one halt -> dump.
 *     Repeated halt/read cycles without a power-cycle degrade (a later
 *     re-halt of the clobbered core is messy and may time out).
 *   - todo: context save/restore (clean resume + repeated reads),
 *     memory write, arm_flash.
 */
|
|
|
|
/* ---- ARM7TDMI JTAG / EmbeddedICE constants ---------------------------- */

/* Public JTAG instructions; the instruction register is 4 bits long. */
#define ARM7_IR_LEN         4
#define IR_SCAN_N           0x2
#define IR_RESTART          0x4
#define IR_INTEST           0xC

/* Scan chain numbers: chain 1 is the debug (instruction/data bus) chain,
 * chain 2 is EmbeddedICE. */
#define SC_DEBUG            1
#define SC_EICE             2

/* EmbeddedICE register addresses. */
#define EICE_DBG_CTRL       0x00
#define EICE_DBG_STATUS     0x01

/* Debug Control register bits (written). */
#define DBG_CTRL_DBGACK     (1u << 0)   /* force DBGACK */
#define DBG_CTRL_DBGRQ      (1u << 1)   /* request debug entry */
#define DBG_CTRL_INTDIS     (1u << 2)   /* disable interrupts in debug */

/* Debug Status register bits (read). */
#define DBG_STATUS_DBGACK   (1u << 0)
#define DBG_STATUS_SYSCOMP  (1u << 3)
#define DBG_STATUS_ITBIT    (1u << 4)   /* core was in Thumb state */
|
|
|
|
/* ARMv4 opcodes injected over scan chain 1 for register/memory access
 * (see ARM7TDMI TRM, debug chapter; mirrors OpenOCD). */
#define ARM_NOP 0xe1a08008u /* mov r8, r8 */

/* Block transfer encoder shared by STMIA/LDMIA: `op` is the base opcode,
 * `w` the writeback bit (bit 21), `rn` the base register (bits 16..19)
 * and `list` the 16-bit register list. */
#define ARM_BLOCK_XFER(op, rn, list, w) \
    ((op) | ((unsigned)(w) << 21) | ((unsigned)(rn) << 16) | (unsigned)(list))
#define ARM_STMIA(rn, list, w) ARM_BLOCK_XFER(0xe8800000u, rn, list, w)
#define ARM_LDMIA(rn, list, w) ARM_BLOCK_XFER(0xe8900000u, rn, list, w)

/* Thumb opcodes, used only to switch a Thumb-state core to ARM state on
 * debug entry. Each 16-bit opcode is duplicated into both halfwords, as
 * the debug data bus presents them. */
#define THUMB_DUP(op)       ((unsigned)(op) | ((unsigned)(op) << 16))
#define ARM_T_NOP           THUMB_DUP(0x46c0) /* mov r8, r8 */
#define ARM_T_STR(rd, rn)   THUMB_DUP(0x6000 | (rd) | ((rn) << 3))
#define ARM_T_MOV(rd, rm)   THUMB_DUP(0x4600 | ((rd) & 0x7) | (((rd) & 0x8) << 4) | \
                                      (((rm) & 0x7) << 3) | (((rm) & 0x8) << 3))
#define ARM_T_LDR_PCREL(rd) THUMB_DUP(0x4800 | ((rd) << 8))
#define ARM_T_BX(rm)        THUMB_DUP(0x4700 | ((rm) << 3))
|
|
|
|
/* Bit-reverse a 32-bit word (bit 0 <-> bit 31, bit 1 <-> bit 30, ...).
 * Scan chain 1 shifts instructions and data with the bit order flipped
 * (TRM); this matches OpenOCD's flip_u32. */
static uint32_t flip32(uint32_t v)
{
    uint32_t reversed = 0;
    int bit;

    /* Peel bits off the low end of v and push them onto the low end of
     * the result, so the first bit taken ends up at the top. */
    for (bit = 0; bit < 32; bit++) {
        reversed = (reversed << 1) | (v & 1u);
        v >>= 1;
    }
    return reversed;
}
|
|
|
|
/* One EmbeddedICE scan-chain-2 access: 38 bits LSB-first =
|
|
* data[0..31] | address[32..36] | read/write[37] (1 = write). On a read,
|
|
* the captured data belongs to the *previously* addressed register. */
|
|
static int eice_scan(jtag_core *jc, int addr, int rw, uint32_t data, uint32_t *out)
|
|
{
|
|
uint8_t buf[5], cap[5];
|
|
int i, r;
|
|
|
|
memset(buf, 0, sizeof(buf));
|
|
for (i = 0; i < 32; i++)
|
|
if (data & (1u << i)) buf[i >> 3] |= (uint8_t)(1u << (i & 7));
|
|
for (i = 0; i < 5; i++)
|
|
if (addr & (1 << i)) { int b = 32 + i; buf[b >> 3] |= (uint8_t)(1u << (b & 7)); }
|
|
if (rw) buf[37 >> 3] |= (uint8_t)(1u << (37 & 7));
|
|
|
|
r = bscan_shift_dr(jc, buf, out ? cap : NULL, 38);
|
|
if (r < 0) return -1;
|
|
|
|
if (out) {
|
|
uint32_t v = 0;
|
|
for (i = 0; i < 32; i++)
|
|
if (cap[i >> 3] & (1u << (i & 7))) v |= (1u << i);
|
|
*out = v;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Select a scan chain via SCAN_N (4-bit register) and enter INTEST so
|
|
* subsequent DR shifts hit that chain. */
|
|
static int chain_select(jtag_core *jc, int chain)
|
|
{
|
|
uint8_t sc = (uint8_t)chain;
|
|
if (bscan_set_ir(jc, IR_SCAN_N, ARM7_IR_LEN) < 0) return -1;
|
|
if (bscan_shift_dr(jc, &sc, NULL, ARM7_IR_LEN) < 0) return -1;
|
|
if (bscan_set_ir(jc, IR_INTEST, ARM7_IR_LEN) < 0) return -1;
|
|
return 0;
|
|
}
|
|
|
|
/* Select the EmbeddedICE scan chain (chain 2). Returns chain_select()'s
 * result: 0 on success, -1 on failure. */
static int eice_select(jtag_core *jc)
{
    int rc = chain_select(jc, SC_EICE);
    return rc;
}
|
|
|
|
/* Write `val` to EmbeddedICE register `addr` with a single scan (R/W bit
 * set, captured data ignored). Returns 0 on success, -1 on failure. */
static int eice_write(jtag_core *jc, int addr, uint32_t val)
{
    const int write_access = 1;

    return eice_scan(jc, addr, write_access, val, NULL);
}
|
|
|
|
/* Read an EmbeddedICE register (two scans: request, then capture). */
|
|
static int eice_read(jtag_core *jc, int addr, uint32_t *val)
|
|
{
|
|
if (eice_scan(jc, addr, 0, 0, NULL) < 0) return -1; /* request */
|
|
if (eice_scan(jc, addr, 0, 0, val) < 0) return -1; /* capture */
|
|
return 0;
|
|
}
|
|
|
|
/* ---- "Quiet" TAP ops: park in Pause, never enter Run-Test/Idle -------
|
|
* The debug core advances one step per Run-Test/Idle clock (Update-DR
|
|
* alone does NOT clock it). After a system-speed access has re-entered
|
|
* debug, any stray RTI clock executes a garbage instruction and clobbers
|
|
* the registers the LDM just loaded. These ops pass through Update but
|
|
* never RTI, so they move the TAP / shift IR & DR without clocking the
|
|
* core. They start from and leave the TAP in a Pause state. */
|
|
|
|
/* Run-Test/Idle -> Pause-DR (the one transition that leaves RTI; the core
|
|
* is either running on MCLK (post-RESTART) or this is a deliberate flush). */
|
|
static void quiet_enter(jtag_core *jc)
|
|
{
|
|
unsigned char tms[4];
|
|
tms[0] = JTAG_STR_TMS; /* RTI -> Select-DR */
|
|
tms[1] = 0; /* -> Capture-DR */
|
|
tms[2] = JTAG_STR_TMS; /* -> Exit1-DR */
|
|
tms[3] = 0; /* -> Pause-DR */
|
|
jc->io_functions.drv_TX_TMS(jc, tms, 4);
|
|
}
|
|
|
|
/* Pause-* -> ... -> Run-Test/Idle (re-arms the normal RTI-based ops). */
|
|
static void quiet_exit(jtag_core *jc)
|
|
{
|
|
unsigned char tms[3];
|
|
tms[0] = JTAG_STR_TMS; /* Pause -> Exit2 */
|
|
tms[1] = JTAG_STR_TMS; /* -> Update */
|
|
tms[2] = 0; /* -> Run-Test/Idle */
|
|
jc->io_functions.drv_TX_TMS(jc, tms, 3);
|
|
}
|
|
|
|
/* Set IR = opcode (len bits), parking in Pause-IR. Starts from any Pause. */
|
|
static int quiet_set_ir(jtag_core *jc, unsigned int opcode, int len)
|
|
{
|
|
unsigned char tms[6], *d;
|
|
int i;
|
|
/* Pause -> Exit2 -> Update -> Select-DR -> Select-IR -> Capture-IR -> Shift-IR */
|
|
tms[0] = JTAG_STR_TMS; tms[1] = JTAG_STR_TMS; tms[2] = JTAG_STR_TMS;
|
|
tms[3] = JTAG_STR_TMS; tms[4] = 0; tms[5] = 0;
|
|
jc->io_functions.drv_TX_TMS(jc, tms, 6);
|
|
|
|
d = malloc(len);
|
|
if (!d) return -1;
|
|
for (i = 0; i < len; i++) {
|
|
d[i] = ((opcode >> i) & 1u) ? JTAG_STR_DOUT : 0;
|
|
if (i == len - 1) d[i] |= JTAG_STR_TMS; /* last bit -> Exit1-IR */
|
|
}
|
|
jc->io_functions.drv_TXRX_DATA(jc, d, NULL, len);
|
|
free(d);
|
|
|
|
tms[0] = 0; /* Exit1-IR -> Pause-IR */
|
|
jc->io_functions.drv_TX_TMS(jc, tms, 1);
|
|
return 0;
|
|
}
|
|
|
|
/* Shift DR (n bits), parking in Pause-DR. Starts from any Pause. */
|
|
static int quiet_shift_dr(jtag_core *jc, const uint8_t *tdi, uint8_t *tdo, int n)
|
|
{
|
|
unsigned char tms[5], *out, *in = NULL;
|
|
int i;
|
|
/* Pause -> Exit2 -> Update -> Select-DR -> Capture-DR -> Shift-DR */
|
|
tms[0] = JTAG_STR_TMS; tms[1] = JTAG_STR_TMS; tms[2] = JTAG_STR_TMS;
|
|
tms[3] = 0; tms[4] = 0;
|
|
jc->io_functions.drv_TX_TMS(jc, tms, 5);
|
|
|
|
out = malloc(n);
|
|
if (!out) return -1;
|
|
if (tdo) { in = malloc(n); if (!in) { free(out); return -1; } }
|
|
for (i = 0; i < n; i++) {
|
|
uint8_t b = tdi ? ((tdi[i / 8] >> (i & 7)) & 1u) : 0;
|
|
out[i] = b ? JTAG_STR_DOUT : 0;
|
|
if (i == n - 1) out[i] |= JTAG_STR_TMS; /* last bit -> Exit1-DR */
|
|
}
|
|
jc->io_functions.drv_TXRX_DATA(jc, out, in, n);
|
|
if (tdo && in) {
|
|
memset(tdo, 0, (size_t)((n + 7) / 8));
|
|
for (i = 0; i < n; i++) if (in[i]) tdo[i / 8] |= (uint8_t)(1u << (i & 7));
|
|
}
|
|
free(out); free(in);
|
|
|
|
tms[0] = 0; /* Exit1-DR -> Pause-DR */
|
|
jc->io_functions.drv_TX_TMS(jc, tms, 1);
|
|
return 0;
|
|
}
|
|
|
|
/* Select a scan chain without clocking the core (quiet). Starts from a
|
|
* Pause state, ends in Pause-IR (INTEST loaded). */
|
|
static int quiet_chain_select(jtag_core *jc, int chain)
|
|
{
|
|
uint8_t sc = (uint8_t)chain;
|
|
if (quiet_set_ir(jc, IR_SCAN_N, ARM7_IR_LEN) < 0) return -1;
|
|
if (quiet_shift_dr(jc, &sc, NULL, ARM7_IR_LEN) < 0) return -1;
|
|
if (quiet_set_ir(jc, IR_INTEST, ARM7_IR_LEN) < 0) return -1;
|
|
return 0;
|
|
}
|
|
|
|
/* Pack a 38-bit EmbeddedICE scan-chain-2 frame into buf[0..4]:
 * bits 0..31 = data, bits 32..36 = low five bits of addr, bit 37 = rw
 * (set when rw is non-zero). Unused high bits of buf[4] are zero. */
static void eice_frame(uint8_t buf[5], int addr, int rw, uint32_t data)
{
    int k;

    /* Data word, little-endian, in the first four bytes. */
    for (k = 0; k < 4; k++)
        buf[k] = (uint8_t)(data >> (8 * k));

    /* Frame bits 32..36 are bits 0..4 of byte 4; bit 37 is bit 5. */
    buf[4] = (uint8_t)(addr & 0x1f);
    if (rw)
        buf[4] |= 0x20;
}
|
|
|
|
/* Latch an instruction into the scan-chain-1 instruction register
|
|
* WITHOUT clocking the core: shift the 33-bit frame, pass through
|
|
* Update-DR (latches the parallel output) but never Run-Test/Idle. Used
|
|
* to displace a stale instruction (e.g. the system-speed LDMIA) that
|
|
* would otherwise be re-executed and clobber registers on the next
|
|
* debug clock. Chain 1 + INTEST must be selected; starts/ends in Pause. */
|
|
static int quiet_latch_chain1(jtag_core *jc, uint32_t instr)
|
|
{
|
|
unsigned char tms[6], out[33];
|
|
uint32_t f = flip32(instr);
|
|
int i;
|
|
|
|
/* Pause -> Exit2 -> Update -> Select-DR -> Capture-DR -> Shift-DR */
|
|
tms[0] = JTAG_STR_TMS; tms[1] = JTAG_STR_TMS; tms[2] = JTAG_STR_TMS;
|
|
tms[3] = 0; tms[4] = 0;
|
|
jc->io_functions.drv_TX_TMS(jc, tms, 5);
|
|
|
|
out[0] = 0; /* no SYSSPEED */
|
|
for (i = 0; i < 32; i++)
|
|
out[1 + i] = (f & (1u << i)) ? JTAG_STR_DOUT : 0;
|
|
out[32] |= JTAG_STR_TMS; /* last bit -> Exit1-DR */
|
|
jc->io_functions.drv_TXRX_DATA(jc, out, NULL, 33);
|
|
|
|
/* Exit1 -> Update-DR (latch, no clock) -> Select-DR -> Capture-DR
|
|
* -> Exit1 -> Pause-DR */
|
|
tms[0] = JTAG_STR_TMS; tms[1] = JTAG_STR_TMS; tms[2] = 0;
|
|
tms[3] = JTAG_STR_TMS; tms[4] = 0;
|
|
jc->io_functions.drv_TX_TMS(jc, tms, 5);
|
|
return 0;
|
|
}
|
|
|
|
/* Clock the debug core exactly n steps: enter Run-Test/Idle for n TCKs,
|
|
* then return to a Pause state. This is the ONLY thing that advances the
|
|
* core, so callers control the debug-clock count precisely (the chain
|
|
* switches and EICE reads above never enter RTI, hence never clock it).
|
|
* Starts from any Pause state, ends in Pause-DR. */
|
|
static void clock_core(jtag_core *jc, int n)
|
|
{
|
|
unsigned char *tms;
|
|
int i, k = 0;
|
|
if (n < 0) n = 0;
|
|
tms = malloc((size_t)(n + 8));
|
|
if (!tms) return;
|
|
tms[k++] = JTAG_STR_TMS; /* Pause -> Exit2 */
|
|
tms[k++] = JTAG_STR_TMS; /* -> Update */
|
|
tms[k++] = 0; /* -> Run-Test/Idle (1st in-RTI clock) */
|
|
for (i = 1; i < n; i++) tms[k++] = 0; /* dwell: n total RTI clocks */
|
|
tms[k++] = JTAG_STR_TMS; /* RTI -> Select-DR */
|
|
tms[k++] = 0; /* -> Capture-DR */
|
|
tms[k++] = JTAG_STR_TMS; /* -> Exit1-DR */
|
|
tms[k++] = 0; /* -> Pause-DR */
|
|
jc->io_functions.drv_TX_TMS(jc, tms, k);
|
|
free(tms);
|
|
}
|
|
|
|
/* Read an EmbeddedICE register without clocking the core (quiet). The
|
|
* EmbeddedICE chain (#2) must already be selected via quiet ops. */
|
|
static int quiet_eice_read(jtag_core *jc, int addr, uint32_t *val)
|
|
{
|
|
uint8_t buf[5], cap[5];
|
|
uint32_t v = 0;
|
|
int i;
|
|
eice_frame(buf, addr, 0, 0);
|
|
if (quiet_shift_dr(jc, buf, NULL, 38) < 0) return -1; /* request */
|
|
if (quiet_shift_dr(jc, buf, cap, 38) < 0) return -1; /* capture */
|
|
for (i = 0; i < 32; i++)
|
|
if (cap[i >> 3] & (1u << (i & 7))) v |= (1u << i);
|
|
*val = v;
|
|
return 0;
|
|
}
|
|
|
|
/* Halt the core: reset the TAP, select EmbeddedICE, raise DBGRQ and poll
 * the debug status register (up to 100 times, 64 idle cycles before each
 * read) until DBGACK is seen. On success the control register is
 * rewritten with DBGACK | INTDIS, which also drops DBGRQ.
 * `t` is unused. Returns 0 once halted, -1 on a TAP error or if DBGACK
 * never appears (a diagnostic is printed to stderr in that case). */
int arm_debug_halt(jtag_core *jc, const jtag_target *t)
{
    uint32_t dbg_status = 0;
    int attempt = 0;
    int acked = 0;

    (void)t;

    bscan_tap_reset(jc);
    if (eice_select(jc) < 0)
        return -1;

    /* DBGRQ makes the core enter debug at the next instruction boundary;
     * that is not instantaneous, hence the poll below. */
    if (eice_write(jc, EICE_DBG_CTRL, DBG_CTRL_DBGRQ) < 0)
        return -1;

    while (!acked && attempt < 100) {
        bscan_idle_cycles(jc, 64);
        if (eice_read(jc, EICE_DBG_STATUS, &dbg_status) < 0)
            return -1;
        acked = (dbg_status & DBG_STATUS_DBGACK) != 0;
        attempt++;
    }

    if (!acked) {
        fprintf(stderr, "arm_debug: halt requested but no DBGACK (status 0x%08x)\n", dbg_status);
        return -1;
    }

    /* Debug entry: force DBGACK, deassert DBGRQ (else the core keeps
     * re-requesting debug and injected instructions can't execute), and
     * disable interrupts. Matches OpenOCD's arm7_9_debug_entry. */
    if (eice_write(jc, EICE_DBG_CTRL, DBG_CTRL_DBGACK | DBG_CTRL_INTDIS) < 0)
        return -1;

    if ((dbg_status & DBG_STATUS_ITBIT) != 0) {
        fprintf(stderr, "arm_debug: warning - core halted in Thumb state; "
                        "ARM instruction injection will be wrong\n");
    }
    return 0;
}
|
|
|
|
/* Resume the core: select EmbeddedICE, clear the debug control register
 * (dropping DBGRQ), load RESTART and spend 16 idle cycles.
 * `t` is unused. Returns 0 on success, -1 if a TAP operation fails. */
int arm_debug_resume(jtag_core *jc, const jtag_target *t)
{
    int failed;

    (void)t;

    /* Clear DBGRQ first; RESTART is what actually exits debug state. */
    failed = eice_select(jc) < 0
          || eice_write(jc, EICE_DBG_CTRL, 0x0) < 0
          || bscan_set_ir(jc, IR_RESTART, ARM7_IR_LEN) < 0;
    if (failed)
        return -1;

    bscan_idle_cycles(jc, 16);
    return 0;
}
|
|
|
|
/* Scan-chain-1 (debug bus) access session. Each access is one
|
|
* bscan_shift_dr of the 33-bit frame, which captures the bus at
|
|
* Capture-DR, applies the instruction at Update-DR and advances the core
|
|
* exactly one debug step (Update -> Run-Test/Idle) — one access == one
|
|
* debug clock. The captured value reflects the bus from the previous
|
|
* step's instruction, the standard ARM7TDMI pipeline that the NOP padding
|
|
* in read/write_core_regs accounts for. (c1_init/c1_end bracket a run of
|
|
* accesses; c1_end is currently a no-op since bscan_shift_dr self-completes
|
|
* each access, but callers must still avoid chain switches mid-run — those
|
|
* clock the halted core and shift the pipeline phase.) */
|
|
typedef struct {
|
|
jtag_core *jc;
|
|
int started;
|
|
} c1_ctx;
|
|
|
|
static void c1_init(c1_ctx *c, jtag_core *jc) { c->jc = jc; c->started = 0; }
|
|
|
|
/* One chain-1 access: shift 33 bits = breakpoint[0] | flip32(instr)[1..32].
|
|
* sysspeed=1 marks the following instruction to run at system speed.
|
|
* capture != NULL reads back the 32-bit debug data bus. */
|
|
static int c1_xfer(c1_ctx *c, uint32_t instr, int sysspeed, uint32_t *capture)
|
|
{
|
|
uint8_t buf[5], cap[5];
|
|
uint32_t f = flip32(instr);
|
|
int i;
|
|
|
|
memset(buf, 0, sizeof(buf));
|
|
if (sysspeed) buf[0] |= 1u; /* bit 0 = breakpoint/SYSSPEED */
|
|
for (i = 0; i < 32; i++) /* bits 1..32 = flip32(instr) */
|
|
if (f & (1u << i)) { int b = 1 + i; buf[b >> 3] |= (uint8_t)(1u << (b & 7)); }
|
|
|
|
/* Shift 33 bits: captures the bus at Capture-DR, applies the
|
|
* instruction at Update-DR and advances the core exactly one debug
|
|
* step via the Update->Run-Test/Idle transition. One access == one
|
|
* debug clock (an extra idle dwell would double-clock the pipeline). */
|
|
if (bscan_shift_dr(c->jc, buf, capture ? cap : NULL, 33) < 0)
|
|
return -1;
|
|
c->started = 1;
|
|
|
|
if (capture) {
|
|
uint32_t raw = 0;
|
|
for (i = 0; i < 32; i++) {
|
|
int b = 1 + i;
|
|
if (cap[b >> 3] & (1u << (b & 7))) raw |= (1u << i);
|
|
}
|
|
*capture = flip32(raw);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* End a chain-1 session. Currently does nothing and always returns 0. */
static int c1_end(c1_ctx *c)
{
    (void)c;
    return 0;
}
|
|
|
|
/* Load core registers from the debug data bus (debug speed):
|
|
* LDMIA r<rn>, {regs} fed by the scanned-in values. */
|
|
static int write_core_regs(c1_ctx *c, int rn, uint32_t mask, const uint32_t *vals)
|
|
{
|
|
int i;
|
|
if (c1_xfer(c, ARM_LDMIA(rn, mask & 0xffff, 0), 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_NOP, 0, NULL) < 0) return -1; /* DECODE */
|
|
if (c1_xfer(c, ARM_NOP, 0, NULL) < 0) return -1; /* EXECUTE 1 */
|
|
for (i = 0; i <= 15; i++)
|
|
if (mask & (1u << i))
|
|
if (c1_xfer(c, vals[i], 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_NOP, 0, NULL) < 0) return -1;
|
|
return 0;
|
|
}
|
|
|
|
/* Read core registers from the debug data bus (debug speed):
|
|
* STMIA r<rn>, {regs}; values appear from the 4th DCLK on. */
|
|
static int read_core_regs(c1_ctx *c, int rn, uint32_t mask, uint32_t *out)
|
|
{
|
|
int i;
|
|
if (c1_xfer(c, ARM_STMIA(rn, mask & 0xffff, 0), 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_NOP, 0, NULL) < 0) return -1; /* DECODE */
|
|
if (c1_xfer(c, ARM_NOP, 0, NULL) < 0) return -1; /* EXECUTE 1 */
|
|
for (i = 0; i <= 15; i++)
|
|
if (mask & (1u << i))
|
|
if (c1_xfer(c, ARM_NOP, 0, &out[i]) < 0) return -1;
|
|
return 0;
|
|
}
|
|
|
|
/* Queue a system-speed load-multiple from real memory into {regs}, with
|
|
* base writeback so r0 advances for the next block. The instruction
|
|
* preceding it carries the SYSSPEED bit. */
|
|
static int load_word_regs(c1_ctx *c, uint32_t mask)
|
|
{
|
|
if (c1_xfer(c, ARM_NOP, 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_NOP, 1, NULL) < 0) return -1; /* SYSSPEED marker */
|
|
if (c1_xfer(c, ARM_LDMIA(0, mask & 0xffff, 1), 0, NULL) < 0) return -1;
|
|
return 0;
|
|
}
|
|
|
|
/* Switch a Thumb-state core to ARM state so the rest of the debug logic
|
|
* can use ARM instructions (mirrors OpenOCD's arm7tdmi_change_to_arm).
|
|
* Clobbers r0 and PC (fine for a read-then-power-cycle flow): loads r0
|
|
* with an even address and BX r0. Thumb instructions are injected as
|
|
* 16-bit opcodes duplicated into both halfwords. Assumes chain 1 +
|
|
* INTEST selected; the caller wraps it in a c1 session. */
|
|
static int change_to_arm(c1_ctx *c)
|
|
{
|
|
/* save r0 (STR r0,[r0]); value discarded */
|
|
if (c1_xfer(c, ARM_T_STR(0, 0), 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, 0, 0, NULL) < 0) return -1; /* data-in slot */
|
|
|
|
/* read pc (MOV r0,r15; STR r0,[r0]); value discarded */
|
|
if (c1_xfer(c, ARM_T_MOV(0, 15), 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_STR(0, 0), 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, 0, 0, NULL) < 0) return -1; /* data-in slot */
|
|
|
|
/* LDR r0,[PC,#0] with data 0 -> r0 = 0 (bits[1:0] cleared) */
|
|
if (c1_xfer(c, ARM_T_LDR_PCREL(0), 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, 0x0, 0, NULL) < 0) return -1; /* LDR data word */
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
|
|
/* BX r0 -> ARM state */
|
|
if (c1_xfer(c, ARM_T_BX(0), 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
if (c1_xfer(c, ARM_T_NOP, 0, NULL) < 0) return -1;
|
|
return 0;
|
|
}
|
|
|
|
/* RESTART, then wait for the system-speed access to complete (DBGACK &
|
|
* SYSCOMP). Leaves the TAP on the EmbeddedICE chain. */
|
|
static int execute_sys_speed(jtag_core *jc)
|
|
{
|
|
uint32_t status = 0;
|
|
int tries;
|
|
|
|
/* RESTART resumes the core to run the one system-speed access. The
|
|
* core needs debug clocks to step through it, but once it re-enters
|
|
* debug any further clock executes a stale instruction and clobbers
|
|
* the loaded registers. So drive it ONE clock at a time and check
|
|
* DBG_STATUS QUIETLY (no clock) between, stopping the instant
|
|
* SYSCOMP & DBGACK appear. Leave the TAP parked (Pause) on EICE. */
|
|
if (bscan_set_ir(jc, IR_RESTART, ARM7_IR_LEN) < 0) return -1;
|
|
|
|
quiet_enter(jc);
|
|
if (quiet_chain_select(jc, SC_EICE) < 0) return -1;
|
|
for (tries = 0; tries < 2000; tries++) {
|
|
if (quiet_eice_read(jc, EICE_DBG_STATUS, &status) < 0) return -1;
|
|
if ((status & DBG_STATUS_DBGACK) && (status & DBG_STATUS_SYSCOMP))
|
|
return 0;
|
|
clock_core(jc, 1);
|
|
}
|
|
fprintf(stderr, "arm_debug: sys-speed access timed out (status 0x%08x)\n", status);
|
|
return -1;
|
|
}
|
|
|
|
/* Read memory by instruction injection. Reads word-aligned blocks
|
|
* covering [addr, addr+len) and copies the requested bytes out.
|
|
* Core registers r0..r14 are clobbered (acceptable for a read-then-
|
|
* power-cycle flow). The core must already be halted (DBGACK).
|
|
*
|
|
* Reads real memory correctly (validated by an objcopy-verified 32 KB
|
|
* flash dump of the LPC2103). Intended flow is power-on -> one halt ->
|
|
* dump; see the file header and the arm7-debug-dclk-timing note for the
|
|
* repeated-halt caveat. */
|
|
int arm_debug_mem_read(jtag_core *jc, const jtag_target *t,
|
|
unsigned long addr, void *buf, unsigned long len)
|
|
{
|
|
unsigned long base = addr & ~3UL;
|
|
unsigned long end = addr + len;
|
|
unsigned long total_words = (((end + 3) & ~3UL) - base) / 4;
|
|
unsigned long done = 0;
|
|
uint32_t r0, status = 0;
|
|
uint8_t *out = buf;
|
|
c1_ctx c1;
|
|
(void)t;
|
|
|
|
if (!buf || len == 0) return -1;
|
|
|
|
/* If the core halted in Thumb state, switch it to ARM. Do the EICE
|
|
* status read first (the chain switch clocks the halted core), then
|
|
* the switch in one continuous chain-1 session so no stray clocks
|
|
* land between change_to_arm and the first instruction. */
|
|
if (chain_select(jc, SC_EICE) < 0) return -1;
|
|
if (eice_read(jc, EICE_DBG_STATUS, &status) < 0) return -1;
|
|
if (chain_select(jc, SC_DEBUG) < 0) return -1;
|
|
/* Debug entry, mirroring OpenOCD's arm7_9_debug_entry to leave a
|
|
* deterministic pipeline regardless of halt state: switch Thumb->ARM
|
|
* if needed, then read all 16 core registers. That STMIA+NOP+NOP+16
|
|
* sequence flushes the firmware out of the pipeline and ends in the
|
|
* same known state for both the Thumb and ARM paths, so the first
|
|
* system-speed read reliably re-enters debug. */
|
|
{
|
|
uint32_t scratch[16];
|
|
c1_init(&c1, jc);
|
|
if (status & DBG_STATUS_ITBIT)
|
|
if (change_to_arm(&c1) < 0) return -1;
|
|
memset(scratch, 0, sizeof(scratch));
|
|
if (read_core_regs(&c1, 0, 0xffff, scratch) < 0) return -1;
|
|
c1_end(&c1);
|
|
}
|
|
|
|
/* WARM-UP: the first system-speed read after debug entry normalizes
|
|
* the sys-speed pipeline but its own result is unreliable. Do one
|
|
* throwaway read block and discard it; every read after it is
|
|
* consistent and correct. (Like the FTDI stale-first-read, but for
|
|
* the ARM debug pipeline.) */
|
|
{
|
|
uint32_t scratch[16];
|
|
r0 = (uint32_t)base;
|
|
c1_init(&c1, jc);
|
|
if (write_core_regs(&c1, 0, 0x1, &r0) < 0) return -1;
|
|
if (load_word_regs(&c1, 0x7ffe) < 0) return -1; /* r1..r14 */
|
|
c1_end(&c1);
|
|
if (execute_sys_speed(jc) < 0) return -1;
|
|
if (quiet_chain_select(jc, SC_DEBUG) < 0) return -1;
|
|
if (quiet_latch_chain1(jc, ARM_NOP) < 0) return -1;
|
|
quiet_exit(jc);
|
|
memset(scratch, 0, sizeof(scratch));
|
|
c1_init(&c1, jc);
|
|
if (read_core_regs(&c1, 0, 0x7ffe, scratch) < 0) return -1;
|
|
c1_end(&c1);
|
|
}
|
|
|
|
r0 = (uint32_t)base;
|
|
|
|
while (done < total_words) {
|
|
uint32_t regs[16];
|
|
uint32_t reg_list;
|
|
unsigned long n = total_words - done;
|
|
unsigned long i;
|
|
if (n > 14) n = 14;
|
|
|
|
/* r1..rn, base (r0) excluded so it can be the autoincrement ptr. */
|
|
reg_list = (uint32_t)((0xffffu >> (15 - n)) & 0xfffe);
|
|
|
|
/* On chain 1 (from change_to_arm or the previous read): set r0
|
|
* once, then queue the system-speed LDM. */
|
|
c1_init(&c1, jc);
|
|
if (done == 0) /* LDM writeback advances r0 after */
|
|
if (write_core_regs(&c1, 0, 0x1, &r0) < 0) return -1;
|
|
if (load_word_regs(&c1, reg_list) < 0) return -1;
|
|
c1_end(&c1);
|
|
|
|
/* RESTART + quiet poll; leaves the TAP parked on EmbeddedICE. */
|
|
if (execute_sys_speed(jc) < 0) return -1;
|
|
|
|
/* Switch back to chain 1 WITHOUT clocking the core (a normal
|
|
* chain_select would clobber the just-loaded registers), then
|
|
* read them out. execute_sys_speed left us parked (Pause) on the
|
|
* EmbeddedICE chain. */
|
|
if (quiet_chain_select(jc, SC_DEBUG) < 0) return -1;
|
|
/* Displace the stale (system-speed LDMIA) instruction with a NOP
|
|
* so the first debug clock re-executes a NOP, not the LDMIA
|
|
* (which would reload r1..rn from the debug bus and lose the
|
|
* memory data). */
|
|
if (quiet_latch_chain1(jc, ARM_NOP) < 0) return -1;
|
|
quiet_exit(jc);
|
|
memset(regs, 0, sizeof(regs));
|
|
c1_init(&c1, jc);
|
|
if (read_core_regs(&c1, 0, reg_list, regs) < 0) return -1;
|
|
c1_end(&c1);
|
|
|
|
for (i = 0; i < n; i++) {
|
|
unsigned long word_addr = base + (done + i) * 4;
|
|
uint32_t w = regs[1 + i];
|
|
int b;
|
|
for (b = 0; b < 4; b++) {
|
|
unsigned long byte_addr = word_addr + b;
|
|
if (byte_addr >= addr && byte_addr < end)
|
|
out[byte_addr - addr] = (uint8_t)(w >> (8 * b));
|
|
}
|
|
}
|
|
done += n;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Memory write is not implemented: prints a notice to stderr and returns
 * -1 without using any of its arguments. */
int arm_debug_mem_write(jtag_core *jc, const jtag_target *t,
                        unsigned long addr, const void *buf, unsigned long len)
{
    (void)len;
    (void)buf;
    (void)addr;
    (void)t;
    (void)jc;

    fprintf(stderr, "arm_debug: mem_write not implemented yet\n");
    return -1;
}
|
|
|
|
int arm_flash_program(jtag_core *jc, const jtag_target *t, const char *file,
|
|
arm_log_fn log, void *user)
|
|
{
|
|
char msg[256];
|
|
(void)jc; (void)file;
|
|
if (log) {
|
|
snprintf(msg, sizeof(msg),
|
|
"arm_flash: backend not implemented yet. "
|
|
"Target '%s' debug=%d ram=0x%lX+0x%lX flash=0x%lX+0x%lX.",
|
|
t ? t->name : "?",
|
|
t ? (int)t->cpu.debug : 0,
|
|
t ? t->cpu.ram_base : 0UL, t ? t->cpu.ram_size : 0UL,
|
|
t ? t->cpu.flash_base : 0UL, t ? t->cpu.flash_size : 0UL);
|
|
log(user, 1, msg);
|
|
}
|
|
return -1;
|
|
}
|