diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml
new file mode 100644
index 00000000..c76d5cad
--- /dev/null
+++ b/.github/workflows/amd.yml
@@ -0,0 +1,183 @@
+name: AMD/Xilinx ports
+
+# Build (and QEMU boot-smoke) the bare-metal wolfIP ports under src/port/amd/:
+# ZCU102 (A53), Versal VMK180 (A72) and Zynq-7000 ZC702 (A9). app.elf builds
+# from repo sources only - no Vitis/FSBL/bootgen/hardware. (BOOT.BIN needs an
+# FSBL + bootgen and is out of scope here.)
+
+on:
+  push:
+    paths:
+      - 'src/port/amd/**'
+      - 'src/wolfip.c'
+      - 'wolfip.h'
+      - 'tools/scripts/amd/**'
+      - '.github/workflows/amd.yml'
+  pull_request:
+    paths:
+      - 'src/port/amd/**'
+      - 'src/wolfip.c'
+      - 'wolfip.h'
+      - 'tools/scripts/amd/**'
+      - '.github/workflows/amd.yml'
+
+# Least privilege: every job only reads the repository (checkout); nothing
+# here pushes commits, comments on PRs or publishes packages.
+permissions:
+  contents: read
+
+# Cancel superseded runs on the same ref (runner optimization).
+concurrency:
+  group: amd-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  ARM_TC_VER: '14.3.rel1'
+  TC_ROOT: /home/runner/toolchains
+
+jobs:
+  # --------------------------------------------------------------------------
+  # Prime the toolchain cache once so the matrix legs don't each re-download
+  # ~150 MB. Pin the official ARM GNU Toolchain bundle that ships BOTH cross
+  # compilers (aarch64-none-elf for ZCU102/Versal, arm-none-eabi for Zynq-7000).
+  # --------------------------------------------------------------------------
+  toolchains:
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+      - name: Cache ARM GNU toolchains
+        id: tc-cache
+        uses: actions/cache@v4
+        with:
+          path: ${{ env.TC_ROOT }}
+          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64
+
+      - name: Download + extract toolchains
+        if: steps.tc-cache.outputs.cache-hit != 'true'
+        run: |
+          set -euxo pipefail
+          mkdir -p "$TC_ROOT"
+          base="https://developer.arm.com/-/media/Files/downloads/gnu/${ARM_TC_VER}/binrel"
+          for t in aarch64-none-elf arm-none-eabi; do
+            f="arm-gnu-toolchain-${ARM_TC_VER}-x86_64-${t}.tar.xz"
+            curl -fSL --retry 3 --retry-delay 5 -o "/tmp/$f" "$base/$f"
+            tar -xf "/tmp/$f" -C "$TC_ROOT"
+          done
+          ls -d "$TC_ROOT"/*/
+
+  # --------------------------------------------------------------------------
+  # Full build matrix: per board x layout x default/SPEED_TEST (~10 legs).
+  # -Werror is already in each board's CFLAGS, so a clean compile is the gate.
+  # --------------------------------------------------------------------------
+  build:
+    needs: toolchains
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { board: zcu102, cross: aarch64-none-elf-, layout: ocm, speed: "" }
+          - { board: zcu102, cross: aarch64-none-elf-, layout: ocm, speed: "-DSPEED_TEST" }
+          - { board: zcu102, cross: aarch64-none-elf-, layout: ddr, speed: "" }
+          - { board: zcu102, cross: aarch64-none-elf-, layout: ddr, speed: "-DSPEED_TEST" }
+          - { board: versal, cross: aarch64-none-elf-, layout: ocm, speed: "" }
+          - { board: versal, cross: aarch64-none-elf-, layout: ocm, speed: "-DSPEED_TEST" }
+          - { board: versal, cross: aarch64-none-elf-, layout: ddr, speed: "" }
+          - { board: versal, cross: aarch64-none-elf-, layout: ddr, speed: "-DSPEED_TEST" }
+          - { board: zynq7000, cross: arm-none-eabi-, layout: ocm, speed: "" }
+          - { board: zynq7000, cross: arm-none-eabi-, layout: ocm, speed: "-DSPEED_TEST" }
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Restore toolchains
+        uses: actions/cache/restore@v4
+        with:
+          path: ${{ env.TC_ROOT }}
+          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64
+          fail-on-cache-miss: true
+
+      - name: Add toolchains to PATH
+        run: |
+          echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-aarch64-none-elf/bin" >> "$GITHUB_PATH"
+          echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-arm-none-eabi/bin" >> "$GITHUB_PATH"
+
+      - name: Build ${{ matrix.board }} (${{ matrix.layout }}${{ matrix.speed && ' SPEED' || '' }})
+        run: |
+          set -euxo pipefail
+          ${{ matrix.cross }}gcc --version | head -1
+          args="CROSS_COMPILE=${{ matrix.cross }}"
+          # zynq7000 is OCM-only (no LAYOUT switch / no target_ddr.ld).
+          if [ "${{ matrix.board }}" != "zynq7000" ]; then
+            args="$args LAYOUT=${{ matrix.layout }}"
+          fi
+          if [ -n "${{ matrix.speed }}" ]; then
+            args="$args CFLAGS_EXTRA=${{ matrix.speed }}"
+          fi
+          make -C "src/port/amd/boards/${{ matrix.board }}" $args
+          ${{ matrix.cross }}size "src/port/amd/boards/${{ matrix.board }}/app.elf"
+
+      - name: Upload app.elf
+        uses: actions/upload-artifact@v4
+        with:
+          name: amd-${{ matrix.board }}-${{ matrix.layout }}-${{ matrix.speed != '' && 'speed' || 'default' }}
+          path: src/port/amd/boards/${{ matrix.board }}/app.elf
+          if-no-files-found: error
+
+  # --------------------------------------------------------------------------
+  # QEMU boot smoke: build the OCM/default app per board and confirm it boots
+  # to "Ready" under the matching Xilinx QEMU machine. zcu102 gates; versal and
+  # zynq7000 are informational (continue-on-error) until their QEMU device
+  # models are confirmed - the machine/UART/load details may need iteration.
+  # --------------------------------------------------------------------------
+  qemu:
+    needs: toolchains
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { board: zcu102, cross: aarch64-none-elf-, gate: true }
+          - { board: versal, cross: aarch64-none-elf-, gate: false }
+          - { board: zynq7000, cross: arm-none-eabi-, gate: false }
+    continue-on-error: ${{ !matrix.gate }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Restore toolchains
+        uses: actions/cache/restore@v4
+        with:
+          path: ${{ env.TC_ROOT }}
+          key: arm-gnu-${{ env.ARM_TC_VER }}-x86_64
+          fail-on-cache-miss: true
+
+      - name: Add toolchains to PATH
+        run: |
+          echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-aarch64-none-elf/bin" >> "$GITHUB_PATH"
+          echo "$TC_ROOT/arm-gnu-toolchain-${ARM_TC_VER}-x86_64-arm-none-eabi/bin" >> "$GITHUB_PATH"
+
+      - name: Install QEMU
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            qemu-system-arm qemu-system-aarch64
+
+      - name: Build ${{ matrix.board }} (OCM, default)
+        run: |
+          set -euxo pipefail
+          make -C "src/port/amd/boards/${{ matrix.board }}" CROSS_COMPILE=${{ matrix.cross }}
+
+      - name: QEMU boot smoke
+        run: |
+          chmod +x tools/scripts/amd/qemu-smoke.sh
+          UART_LOG="uart-${{ matrix.board }}.log" \
+            tools/scripts/amd/qemu-smoke.sh "${{ matrix.board }}"
+
+      - name: Upload UART log
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: qemu-uart-${{ matrix.board }}
+          path: uart-${{ matrix.board }}.log
+          if-no-files-found: warn
diff --git a/src/port/amd/README.md b/src/port/amd/README.md
new file mode 100644
index 00000000..e3114037
--- /dev/null
+++ b/src/port/amd/README.md
@@ -0,0 +1,152 @@
+# wolfIP AMD/Xilinx bare-metal ports
+
+Bare-metal wolfIP ports for AMD/Xilinx PS-GEM SoCs, sharing one tree:
+
+- **ZCU102** - ZynqMP, Cortex-A53, AArch64, EL3
+- **Versal Gen 1 / VMK180** - Cortex-A72, AArch64, EL3
+- **Zynq-7000 / ZC702** - Cortex-A9, ARMv7-A, SVC
+
+All
three are brought up on real hardware (DHCP, ICMP ping, UDP echo). + +## Layout + +Shared code lives once; each board's Makefile selects which components to +compile (build-selected files, not `#ifdef` forks). + +``` +common/ arch- and SoC-independent + app.c app.h shared UDP-echo + DHCP demo (board hooks: board.c) + gem_core.c gem.h shared Cadence GEM core (init, MDIO, polled TX, diag) + gem_regs.h gem_port.h GEM register map / internal hook interface + uart_util.c shared UART helpers (puts/puthex/putdec/putip4) + entropy.c memuse-pattern RNG (counter via arch_counter64) + wolfip_config.h shared wolfIP profile (board config.h includes it) + gic.h uart.h mmu.h driver API headers + +arch/aarch64/ cache.h timer.h mmu_aarch64.c startup_aarch64.S exception_aarch64.c +arch/armv7/ cache.h timer.h mmu_armv7.c startup_armv7.S + +ip/ per-IP-block drivers (build-selected) + uart_cadence.c uart_pl011.c UART + gic_gicv2.c gic_gicv3.c GIC + gem_swq.c gem_rx_swq_poll.c gem_rx_poll.c RX delivery model (all boards poll) + gem_rx_irq.c reference IRQ-driven RX (not built; see file) + phy_dp83867.c phy_marvell.c PHY drivers + phy_dispatch_dp83867.c phy_dispatch_multi.c PHY vendor dispatch + +boards/<board>/ the build root for each board (keeps app.elf + JTAG in place) + board.h board.c board_gem.c config.h Makefile target*.ld jtag/ [bootgen/] +``` + +## Component selection per board + +| Component | ZCU102 | Versal | Zynq-7000 | +|-----------|--------|--------|-----------| +| arch | aarch64 | aarch64 | armv7 | +| UART | cadence | pl011 | cadence | +| GIC | gicv2 | gicv3 | gicv2 | +| GEM RX | gem_rx_swq_poll + gem_swq | gem_rx_swq_poll + gem_swq | gem_rx_poll | +| PHY | dp83867 | dp83867 | dp83867 + marvell (multi) | +| GEM inst | GEM3 | GEM0 | GEM0 | + +## Build + +``` +cd boards/zcu102 && make CROSS_COMPILE=aarch64-none-elf- +cd boards/versal && make CROSS_COMPILE=aarch64-none-elf- +cd boards/zynq7000 && make CROSS_COMPILE=arm-none-eabi- +``` + +Output is `app.elf` in the board directory. 
See each board's `README.md` +for the JTAG / BOOT.BIN flow and bring-up notes. + +## Throughput test (SPEED_TEST) + +The default build runs the UDP echo + DHCP demo. Building with +`CFLAGS_EXTRA=-DSPEED_TEST` instead brings up a TCP throughput server on +**port 9** (a discard/chargen-style sink + source, in the spirit of iperf but +without iperf3's JSON control channel, which is impractical on bare metal). On +each accepted connection the board sinks everything the host sends (RX) and, in +the same window, sources chargen data whenever the socket is writable (TX); on +close it prints the byte totals and an average rate over the UART: + +``` +cd boards/zcu102 && make CROSS_COMPILE=aarch64-none-elf- CFLAGS_EXTRA=-DSPEED_TEST +``` + +Measure from a host on the same subnet as the board (replace `<board-ip>` with the +leased address printed at DHCP bind): + +``` +# RX (host -> board): how fast the board sinks +dd if=/dev/zero bs=1460 count=20000 | nc -q1 <board-ip> 9 + +# TX (board -> host): how fast the board sources +nc <board-ip> 9 >/dev/null +``` + +The board's own `SPEED done ... RX/TX bytes (~B/s)` UART line is the +authoritative figure (it times the connection with the hardware clock). Note +the RX and TX counters cover the same connection window, so during the RX run +the board is also back-sourcing; the printed RX B/s is the host->board goodput +under that concurrent load. iperf3 host-to-host on the same link is a useful +*link* reference, but the board is not an iperf3 endpoint. + +The `SPEED_TEST` build also widens the TCP window (`RXBUF_SIZE`/`TXBUF_SIZE` to +`LINK_MTU * 6` in `config.h`) and trims the UDP socket count to keep the larger +per-socket buffers inside the 256 KB OCM budget. + +### Results + +Single Cortex core, 1 Gbps RGMII link, MTU 1500, host on the same switch. +RX is the board's UART `~B/s` line (host -> board); TX is host-measured +(board -> host). Bytes x8 for Mbps. 
+ +| Board (SoC, core) | Layout / boot | RX Mbps | TX Mbps | +|------------------------------|-----------------|--------:|--------:| +| VMK180 (Versal, A72 @ EL3) | DDR (JTAG) | ~300 | ~334 | +| ZCU102 (ZynqMP, A53 @ EL3) | DDR (SD boot) | ~126 | ~194 | +| ZC702 (Zynq-7000, A9 @ SVC) | OCM (JTAG) | ~22 | ~19 | +| ZCU102 (ZynqMP, A53 @ EL3) | OCM (JTAG) | ~10 | ~9 | + +The single dominant factor is the **memory layout**: the OCM layout runs *all* +code (and the rings) from Normal non-cacheable OCM, so every instruction fetch +and frame copy is uncached. The DDR layout keeps code+data in cacheable DDR and +maps only the GEM DMA region non-cacheable - ~13-30x faster, as the two ZCU102 +rows show directly (same SoC/core, OCM ~10/9 vs DDR ~126/194 Mbps). The faster +A72 (Versal) reaches ~300/334 on DDR. + +How each DDR number was loaded: Versal's PLM trains DDR from a boot PDI, so the +DDR app loads cleanly over JTAG. On ZynqMP, JTAG writes into DDR after a bare +`psu_init` are unreliable (the load goes through the A53 with a cache flush and +either errors or lands corrupt - DDR itself is fine, a direct DAP memtest passes), +so the ZCU102 DDR figure is from an **SD boot**: `FSBL_ELF=.../zynqmp_fsbl.elf +make bootbin` produces a DDR-layout `BOOT.BIN` that the FSBL trains DDR for and +DMA-loads (no JTAG memory writes). Copy it to the SD card's FAT boot partition +and set SW6 = SD. The same applies to ZC702 (its OCM-only port has no DDR layout +yet; a DDR profile is future work). + +What it took to get here: + +1. **NC-map the DMA rings in the DDR layout (correctness, not just speed).** + The DDR layout had mapped the GEM BD rings cacheable with per-BD + `cache_clean`. Because the 8-byte BDs share 64-byte cache lines, cleaning one + BD wrote stale neighbours back over MAC-set OWN bits and wedged the RX ring + under sustained (TCP-rate) load - the UDP-only profile never had two BDs live + in a line at once. 
The DMA region is now Normal-NC in both layouts, with + `.dma_buffers` in its own 2 MB block so `.text` stays cacheable. +2. **Main-loop poll cadence.** The original loop called `wolfIP_poll()` then + `delay_ms(1)`, capping the stack at ~1 poll/ms (~12 Mbps) and feeding wolfIP + a `tick++` counter that only approximated real milliseconds. It now + busy-polls with a real-millisecond clock from the hardware timer + (`timer_now()/timer_freq()`), which also de-skews every DHCP/TCP/ARP timeout. +3. **Drain RX fully, bounded TX per event.** Reading one chunk per READABLE + left the advertised TCP window stuck (~2 KB) and deadlocked; the SPEED server + now drains the rx buffer each event and does a bounded tx fill. +4. **Word-wise `memcpy`/`memset`.** Frame-staging copies are 8 bytes at a time + (bytewise tail), which matters for the non-cacheable DMA buffers. + +Notes / remaining levers: ZCU102 uses the same poll-driven RX as the other two +boards - its original IRQ-driven RX storms the CPU under sustained RX load. +A DDR/BOOT.BIN profile for the OCM boards (cached code) and draining more than +one frame per poll are the next levers. diff --git a/src/port/amd/arch/aarch64/cache.h b/src/port/amd/arch/aarch64/cache.h new file mode 100644 index 00000000..a2524721 --- /dev/null +++ b/src/port/amd/arch/aarch64/cache.h @@ -0,0 +1,41 @@ +/* cache.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * AArch64 (Cortex-A53/A72) cache maintenance for GEM DMA coherency. The + * cache line is 64 bytes. With D-cache enabled and BD/buffers in normal + * cacheable memory, CPU writes may sit in L1 and not be visible to the + * MAC's DMA path. cache_clean() writes back dirty lines before DMA + * reads; cache_inval() invalidates lines so CPU reads pull fresh + * DMA-written data. 
+ */ +#ifndef AMD_CACHE_H +#define AMD_CACHE_H + +#include + +#define CACHE_LINE 64u + +static inline void cache_clean(const void *p, uint32_t sz) +{ + uintptr_t start = (uintptr_t)p & ~(CACHE_LINE - 1u); + uintptr_t end = ((uintptr_t)p + sz + CACHE_LINE - 1u) & ~(CACHE_LINE - 1u); + uintptr_t a; + for (a = start; a < end; a += CACHE_LINE) + __asm__ volatile ("dc cvac, %0" :: "r"(a) : "memory"); + __asm__ volatile ("dsb sy" ::: "memory"); +} + +static inline void cache_inval(const void *p, uint32_t sz) +{ + uintptr_t start = (uintptr_t)p & ~(CACHE_LINE - 1u); + uintptr_t end = ((uintptr_t)p + sz + CACHE_LINE - 1u) & ~(CACHE_LINE - 1u); + uintptr_t a; + for (a = start; a < end; a += CACHE_LINE) + __asm__ volatile ("dc ivac, %0" :: "r"(a) : "memory"); + __asm__ volatile ("dsb sy" ::: "memory"); +} + +#endif /* AMD_CACHE_H */ diff --git a/src/port/amd/arch/aarch64/exception_aarch64.c b/src/port/amd/arch/aarch64/exception_aarch64.c new file mode 100644 index 00000000..c3698765 --- /dev/null +++ b/src/port/amd/arch/aarch64/exception_aarch64.c @@ -0,0 +1,55 @@ +/* exception_aarch64.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * AArch64 EL3 fault reporters, called from the startup.S vector + * trampolines on a synchronous exception or SError/FIQ. ARMv7 ports + * just hang on faults (no C reporter), so this file is AArch64-only. + */ +#include +#include "uart.h" + +/* Called from startup.S vector trampoline on synchronous/SError fault. */ +void exception_report(uint64_t esr, uint64_t elr, uint64_t far, uint64_t spsr) +{ + uart_puts("\n\n*** EL3 SYNC EXCEPTION ***\n"); + uart_puts(" ESR_EL3 : "); uart_puthex((uint32_t)(esr >> 32)); + uart_puthex((uint32_t)esr); uart_puts("\n"); + uart_puts(" EC = "); uart_puthex((uint32_t)((esr >> 26) & 0x3F)); + uart_puts(" (0x21=instr abort, 0x25=data abort, 0x24=alignment)\n"); + uart_puts(" ELR_EL3 : "); uart_puthex((uint32_t)(elr >> 32)); + uart_puthex((uint32_t)elr); uart_puts("\n"); + uart_puts(" FAR_EL3 : "); uart_puthex((uint32_t)(far >> 32)); + uart_puthex((uint32_t)far); uart_puts("\n"); + uart_puts(" SPSR_EL3: "); uart_puthex((uint32_t)spsr); uart_puts("\n"); +} + +void exception_report_serror(uint64_t esr, uint64_t elr, uint64_t far, + uint64_t spsr, uint64_t kind) +{ + (void)kind; + uart_puts("\n\n*** EL3 SError / FIQ ***\n"); + uart_puts(" ESR_EL3 : "); uart_puthex((uint32_t)(esr >> 32)); + uart_puthex((uint32_t)esr); uart_puts("\n"); + uart_puts(" ELR_EL3 : "); uart_puthex((uint32_t)(elr >> 32)); + uart_puthex((uint32_t)elr); uart_puts("\n"); + uart_puts(" FAR_EL3 : "); uart_puthex((uint32_t)(far >> 32)); + uart_puthex((uint32_t)far); uart_puts("\n"); + uart_puts(" SPSR_EL3: "); uart_puthex((uint32_t)spsr); uart_puts("\n"); +} diff --git a/src/port/amd/arch/aarch64/mmu_aarch64.c b/src/port/amd/arch/aarch64/mmu_aarch64.c new file mode 100644 index 00000000..b1939f7d --- /dev/null +++ 
b/src/port/amd/arch/aarch64/mmu_aarch64.c @@ -0,0 +1,242 @@ +/* mmu.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Minimal MMU bring-up for A53/A72 at EL3 with a 32-bit virtual address + * space (T0SZ=32, start level L1). Static tables map the full 4 GB VA + * range: + * + * L1[0] -> L2_DDR (0x00000000 .. 0x3FFFFFFF, 1 GB, 2 MB granular) + * L1[1] -> 0x40000000 .. 0x7FFFFFFF Normal WB IS (1 GB block) + * L1[2] -> invalid (0x80000000 .. 0xBFFFFFFF) + * L1[3] -> L2_PERIPH (0xC0000000 .. 0xFFFFFFFF, 1 GB, 2 MB granular) + * + * L2_DDR has a Normal-NC carve-out for any 2 MB block overlapping the + * linker's [_dma_buffers_start, _dma_buffers_end) range. In the current + * OCM-only layout the .dma_buffers section lives in OCM (mapped via + * L2_PERIPH[511] Normal-WB), so this carve-out is effectively dormant - + * GEM DMA coherency is handled with explicit DC CVAC / IVAC ops in + * gem.c. The carve-out remains in the tables so a future DDR-resident + * layout works without an MMU change. 
+ * + * L2_PERIPH covers the PS peripheral aperture as Device-nGnRnE except + * entry 511 (0xFFE00000..0xFFFFFFFF) which is Normal-WB executable so + * code can be fetched from OCM (0xFFFC0000..0xFFFFFFFF) after the MMU + * is enabled. + * + * MAIR_EL3: + * ATTR0 = 0xFF (Normal Inner+Outer WB Cacheable, Read+Write alloc) + * ATTR1 = 0x00 (Device-nGnRnE - PS peripherals) + * ATTR2 = 0x44 (Normal Inner+Outer Non-Cacheable - reserved for a + * future DDR DMA carve-out) + * + * Block descriptor low attributes: + * bits[1:0] = 0b01 block + * bits[5:2] = AttrIndx + * bits[7:6] = AP = 0 (RW at EL3) + * bits[9:8] = SH = 0b11 inner-shareable (only meaningful for Normal) + * bit[10] = AF = 1 + * bit[54] = UXN/XN = 1 for Device, 0 for Normal RX + */ +#include +#include "mmu.h" + +extern uint8_t _dma_buffers_start[]; +extern uint8_t _dma_buffers_end[]; + +/* L1 has 4 entries (one per GB in our 4 GB VA). Section attribute keeps + * it in the dedicated .page_tables area so it lives at a known DDR + * address - the MMU walker still uses physical addresses to read it. */ +static volatile uint64_t L1[512] + __attribute__((aligned(4096), section(".page_tables"))); +static volatile uint64_t L2_DDR[512] + __attribute__((aligned(4096), section(".page_tables"))); +/* L2 for the 3-4 GB region. Most blocks are Device (PS peripherals) + * but the 2 MB block at 0xFFE00000 - 0xFFFFFFFF must be Normal+exec + * because OCM (0xFFFC0000-0xFFFFFFFF) lives there and our code runs + * from OCM. 
*/ +static volatile uint64_t L2_PERIPH[512] + __attribute__((aligned(4096), section(".page_tables"))); + +#define DESC_VALID (1ULL << 0) +#define DESC_TABLE (1ULL << 1) +#define DESC_BLOCK (0ULL << 1) +#define DESC_AF (1ULL << 10) +#define DESC_SH_INNER (3ULL << 8) +#define DESC_AP_RW_EL3 (0ULL << 6) +#define DESC_XN (1ULL << 54) +#define DESC_ATTR(i) (((uint64_t)(i) & 7ULL) << 2) + +#define ATTR_NORMAL 0 /* AttrIndx 0 = MAIR ATTR0 (Normal WB) */ +#define ATTR_DEVICE 1 /* AttrIndx 1 = MAIR ATTR1 (Device) */ +#define ATTR_NORMAL_NC 2 /* AttrIndx 2 = MAIR ATTR2 (Normal NC) */ + +#define BLOCK_NORMAL(pa) \ + (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \ + DESC_SH_INNER | DESC_AP_RW_EL3 | DESC_ATTR(ATTR_NORMAL)) + +#define BLOCK_DEVICE(pa) \ + (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \ + DESC_AP_RW_EL3 | DESC_ATTR(ATTR_DEVICE) | DESC_XN) + +#define BLOCK_NORMAL_NC(pa) \ + (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \ + DESC_SH_INNER | DESC_AP_RW_EL3 | DESC_ATTR(ATTR_NORMAL_NC) | DESC_XN) + +/* Normal Non-Cacheable but executable (no XN) -- for the OCM block in the + * OCM layout, where code, data and the GEM BD rings all share OCM. */ +#define BLOCK_NORMAL_NC_EXEC(pa) \ + (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \ + DESC_SH_INNER | DESC_AP_RW_EL3 | DESC_ATTR(ATTR_NORMAL_NC)) + +#define TABLE_DESC(pa) \ + (((uint64_t)(pa)) | DESC_TABLE | DESC_VALID) + +#define L2_BLOCK_SIZE (2ULL * 1024 * 1024) /* 2 MB */ +#define L1_BLOCK_SIZE (1024ULL * 1024 * 1024) /* 1 GB */ + +static void mmu_build_tables(void) +{ + uint64_t addr; + uint64_t dma_lo; + uint64_t dma_hi; + int i; + + /* L2_DDR: 512 entries covering 0..1 GB at 2 MB each. 
*/ + dma_lo = (uint64_t)(uintptr_t)_dma_buffers_start; + dma_hi = (uint64_t)(uintptr_t)_dma_buffers_end; + for (i = 0; i < 512; i++) { + addr = (uint64_t)i * L2_BLOCK_SIZE; + if ((addr + L2_BLOCK_SIZE) <= dma_lo || addr >= dma_hi) { + L2_DDR[i] = BLOCK_NORMAL(addr); + } else { + /* Any 2 MB block overlapping the GEM DMA region is mapped + * Normal-NC, in BOTH layouts. The 8-byte BDs share 64-byte + * cache lines, so cleaning one BD's line writes neighbouring + * BDs back over MAC-set OWN/USED bits (Skoll HIGH-2). A + * cacheable ring with per-BD DC ops therefore corrupts the + * ring under sustained RX (TCP-rate) and wedges it; NC makes + * the rings DMA-coherent with no cache maintenance. NC also + * keeps the wrapped word-wise memcpy/memset (aligned 64-bit + + * byte tail) from faulting when staging frames. In the DDR + * layout this block may also back part of the stack; the + * uncached cost there is acceptable for correctness. */ + L2_DDR[i] = BLOCK_NORMAL_NC(addr); + } + } + + /* L2_PERIPH: 3..4 GB range. All Device-nGnRnE except the last + * 2 MB block which contains OCM (0xFFFC0000..0xFFFFFFFF) and + * must be Normal+executable so we can fetch our code from OCM. */ + for (i = 0; i < 511; i++) { + addr = 3ULL * L1_BLOCK_SIZE + (uint64_t)i * L2_BLOCK_SIZE; + L2_PERIPH[i] = BLOCK_DEVICE(addr); + } + /* Entry 511 covers 0xFFE00000..0xFFFFFFFF, containing OCM + * (0xFFFC0000+). */ +#ifdef AMD_LAYOUT_DDR + /* DDR layout: code+data live in DDR; OCM here holds only the reset + * vectors. Keep it Normal-WB cacheable, executable. */ + L2_PERIPH[511] = BLOCK_NORMAL(3ULL * L1_BLOCK_SIZE + + 511ULL * L2_BLOCK_SIZE); +#else + /* OCM layout: text, data and the GEM BD rings/buffers all live in OCM. + * Map the OCM block Normal Non-Cacheable (still executable) so the + * rings are DMA-coherent without per-descriptor cache maintenance -- + * and so a cache-line clean can never write back a stale neighbour BD + * over a MAC-set OWN/USED bit (Skoll HIGH-2). 
Instruction fetch from + * NC Normal memory is permitted; OCM is single-cycle SRAM so the lost + * D-cache is not significant for this deterministic profile. */ + L2_PERIPH[511] = BLOCK_NORMAL_NC_EXEC(3ULL * L1_BLOCK_SIZE + + 511ULL * L2_BLOCK_SIZE); +#endif + + /* L1 entries. */ + L1[0] = TABLE_DESC((uintptr_t)L2_DDR); + L1[1] = BLOCK_NORMAL(L1_BLOCK_SIZE); /* 1..2 GB DDR */ + L1[2] = 0; /* 2..3 GB unused */ + L1[3] = TABLE_DESC((uintptr_t)L2_PERIPH); /* 3..4 GB peri + OCM */ + + for (i = 4; i < 512; i++) + L1[i] = 0; +} + +void mmu_enable(void) +{ + uint64_t mair; + uint64_t tcr; + uint64_t sctlr; + + mmu_build_tables(); + + /* Make sure the table writes are visible to the table walker + * before we point TTBR at them. We are still running with the + * D-cache off here, so a DSB SY is sufficient. */ + __asm__ volatile ("dsb sy" ::: "memory"); + + /* MAIR_EL3: + * ATTR0 = 0xFF (Normal WB Inner+Outer Cacheable) + * ATTR1 = 0x00 (Device-nGnRnE) + * ATTR2 = 0x44 (Normal Inner+Outer Non-Cacheable, for DMA buffers) */ + mair = (0xFFULL << 0) | (0x00ULL << 8) | (0x44ULL << 16); + __asm__ volatile ("msr mair_el3, %0" :: "r"(mair)); + + /* TCR_EL3: 32-bit VA (T0SZ=32, start level L1), 4 KB granule, + * IRGN0=WB-RA-WA, ORGN0=WB-RA-WA, SH0=Inner shareable, IPS=40 bit. + * EL3 TCR has T0SZ at bits [5:0], IRGN0[9:8], ORGN0[11:10], + * SH0[13:12], TG0[15:14], PS[18:16], TBI[20], RES1 at bit 23,31. + */ + tcr = (uint64_t)32 /* T0SZ = 32 -> 4 GB VA */ + | ((uint64_t)1 << 8) /* IRGN0 = WB RA-WA */ + | ((uint64_t)1 << 10) /* ORGN0 = WB RA-WA */ + | ((uint64_t)3 << 12) /* SH0 = Inner shareable */ + | ((uint64_t)0 << 14) /* TG0 = 4 KB */ + | ((uint64_t)2 << 16) /* PS = 40 bit PA */ + | ((uint64_t)1 << 23) /* RES1 */ + | ((uint64_t)1 << 31); /* RES1 */ + __asm__ volatile ("msr tcr_el3, %0" :: "r"(tcr)); + + /* TTBR0_EL3 = &L1. 
*/ + __asm__ volatile ("msr ttbr0_el3, %0" :: "r"((uint64_t)(uintptr_t)L1)); + + __asm__ volatile ("isb" ::: "memory"); + + /* Invalidate TLBs and I-cache before turning the MMU on. */ + __asm__ volatile ("tlbi alle3" ::: "memory"); + __asm__ volatile ("ic iallu" ::: "memory"); + __asm__ volatile ("dsb sy" ::: "memory"); + __asm__ volatile ("isb" ::: "memory"); + + /* Enable MMU + I-cache + D-cache. Cache coherency with GEM DMA + * is handled with explicit DC CVAC / DC IVAC ops in eth_send and + * eth_poll (see gem.c cache_*() helpers). + * + * DZE bit 14 = enable DC ZVA at EL0/EL1 (and EL3 since we are + * here). Newlib aarch64 memset uses DC ZVA for fast bulk zero + * writes; without DZE=1 the instruction traps UNDEF and the + * exception loop wedges the CPU. */ + __asm__ volatile ("mrs %0, sctlr_el3" : "=r"(sctlr)); + sctlr |= (1ULL << 0); /* M */ + sctlr |= (1ULL << 2); /* C */ + sctlr |= (1ULL << 12); /* I */ + sctlr |= (1ULL << 14); /* DZE - allow DC ZVA */ + sctlr &= ~(1ULL << 1); /* A off */ + __asm__ volatile ("msr sctlr_el3, %0" :: "r"(sctlr)); + __asm__ volatile ("isb" ::: "memory"); +} diff --git a/src/port/amd/arch/aarch64/startup_aarch64.S b/src/port/amd/arch/aarch64/startup_aarch64.S new file mode 100644 index 00000000..920497de --- /dev/null +++ b/src/port/amd/arch/aarch64/startup_aarch64.S @@ -0,0 +1,289 @@ +/* startup.S + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * AArch64 EL3 entry shared by the ZynqMP (Cortex-A53) and Versal + * (Cortex-A72) boards. A loader (FSBL / PLM->BL31 / wolfBoot) drops us + * at EL3 with caches/MMU off (we force them off again to be safe) and + * branches to _start. The early UART breadcrumb offset differs per UART + * IP (Cadence TX FIFO at 0x30, PL011 DR at 0x00); the board Makefile + * passes it as UART_EARLY_TX_OFF. + */ +#ifndef UART_EARLY_TX_OFF +#define UART_EARLY_TX_OFF 0x30 /* Cadence TX FIFO; PL011 boards pass 0x00 */ +#endif + + /* A loader (FSBL, wolfBoot, ...) that respects the ELF entry + * point in the program header branches to _start directly. A + * loader that simply branches to the first byte of the binary + * (wolfBoot's do_boot path does this -- it uses LOAD_ADDRESS, + * not the ELF entry) lands on this stub instead, which forwards + * to _start. Linker scripts place this section first in the + * output image so it always sits at offset 0 of the binary. */ + .section .boot_entry, "ax" + .global _boot_entry +_boot_entry: + b _start + + .section .vectors, "ax" + .align 11 /* 2048 byte align required by VBAR */ + .global _vectors +_vectors: + /* Current EL with SP0 (not used; we always use SPx). */ + .align 7 + b el3_sync_trampoline /* sync */ + .align 7 + b el3_irq_trampoline /* IRQ */ + .align 7 + b el3_irq_trampoline /* FIQ - same handler (see note below) */ + .align 7 + b el3_serror_trampoline /* SError */ + /* Current EL with SPx */ + .align 7 + b el3_sync_trampoline /* sync */ + .align 7 + b el3_irq_trampoline /* IRQ */ + .align 7 + b el3_irq_trampoline /* FIQ - same handler; the GIC in + * secure mode may deliver Group 0 + * interrupts via nFIQ depending on + * GICC_CTLR.FIQEn. 
We route both to + * the same trampoline so the C + * dispatcher sees the INTID either + * way. */ + .align 7 + b el3_serror_trampoline /* SError */ + /* Lower EL using AArch64 (unused, we stay at EL3) */ + .align 7 + b _hang + .align 7 + b _hang + .align 7 + b _hang + .align 7 + b _hang + /* Lower EL using AArch32 (unused) */ + .align 7 + b _hang + .align 7 + b _hang + .align 7 + b _hang + .align 7 + b _hang + + .section .text, "ax" + .global _start + .type _start, %function +_start: + /* VERY FIRST INSTRUCTIONS - prove we're running. Write '@' to + * UART0 TX FIFO. No register-bank-dependent ops here. */ + mov x1, #0xFF000000 + mov w2, #'@' + str w2, [x1, #UART_EARLY_TX_OFF] + str w2, [x1, #UART_EARLY_TX_OFF] + str w2, [x1, #UART_EARLY_TX_OFF] + + /* Make sure we are on A53-0. If FSBL released us as A53-1/2/3 by + * accident, park them. */ + mrs x0, mpidr_el1 + and x0, x0, #0xff /* Aff0 */ + cbnz x0, _park_secondary + + /* Disable MMU + caches in case FSBL left them on. */ + mrs x0, sctlr_el3 + bic x0, x0, #(1 << 0) /* M - MMU off */ + bic x0, x0, #(1 << 2) /* C - D-cache off */ + bic x0, x0, #(1 << 12) /* I - I-cache off */ + msr sctlr_el3, x0 + isb + + /* Allow FP/SIMD at EL3 (FSBL does this too, but be explicit). */ + msr cptr_el3, xzr + + /* Force SPSel = 1 (use SP_ELx). The IRQ vector at offset 0x280 + * (Current EL with SPx) is what we wired el3_irq_trampoline to. + * FSBL may have left SPSel at 0 (SP_EL0); fix it deterministically. */ + msr spsel, #1 + + /* Force SCR_EL3 to a known state. We run entirely at EL3 in + * Secure world. The wolfBoot AArch64 startup explicitly sets the + * IRQ + FIQ + EA routing bits even though the ARM ARM says they + * only affect lower-EL interrupts; reusing that convention here + * because empirically the A53 does not enter the IRQ exception + * unless these are set (ISR_EL1.I went high but no exception + * fired with these bits clear). 
+ * bit 0 NS = 0 (stay Secure) + * bit 1 IRQ = 1 (route IRQ to EL3) + * bit 2 FIQ = 1 (route FIQ to EL3) + * bit 3 EA = 1 (route SError/abort to EL3) + * bit 10 RW = 0 (no lower EL64; we never drop to lower EL) */ + mov x0, #((1 << 1) | (1 << 2) | (1 << 3)) + msr scr_el3, x0 + isb + + /* Vector base. */ + adrp x0, _vectors + add x0, x0, :lo12:_vectors + msr vbar_el3, x0 + + /* Stack pointer. After 'msr spsel, #1' this writes SP_EL3. */ + ldr x0, =_stack_top + mov sp, x0 + + /* Very early UART poke - one char before any C code, so even if a + * later step hangs we know _start was reached. Writes '!' to + * UART0 TX FIFO (0xFF000030). Assumes FSBL/psu_init already + * configured UART0 baud. */ + mov x1, #0xFF000000 + mov w2, #'!' + str w2, [x1, #UART_EARLY_TX_OFF] + + /* Clear BSS. */ + ldr x0, =_sbss + ldr x1, =_ebss +1: cmp x0, x1 + b.ge 2f + str xzr, [x0], #8 + b 1b +2: + /* Breadcrumb: BSS cleared. */ + mov x1, #0xFF000000 + mov w2, #'B' + str w2, [x1, #UART_EARLY_TX_OFF] + + /* Bring up the MMU + caches. C function in mmu.c. */ + bl mmu_enable + + /* Breadcrumb: MMU enabled. */ + mov x1, #0xFF000000 + mov w2, #'M' + str w2, [x1, #UART_EARLY_TX_OFF] + + /* Branch to main. */ + bl main + + /* main() should not return. If it does, hang. */ + b _hang + + .type _park_secondary, %function +_park_secondary: + wfe + b _park_secondary + + .global _hang + .type _hang, %function +_hang: + b _hang + +/* --------------------------------------------------------------------- + * IRQ trampoline. EL3 IRQ vector -> save GP regs, call C handler, restore. + * Keeps the C handler clean and avoids __attribute__((interrupt)) tricks + * which are not reliable on aarch64. + * ------------------------------------------------------------------- */ + .type el3_irq_trampoline, %function +el3_irq_trampoline: + /* Save full integer register file (x0-x30) plus SPSR_EL3/ELR_EL3. + * Frame is 18 * 16 = 288 bytes (16-byte aligned). 
Callee-saved
+     * x19-x28 must be preserved too: irq_dispatch is an ordinary C
+     * function and may clobber them, while the interrupted code
+     * almost certainly relies on them. */
+    sub sp, sp, #(18 * 16)
+    stp x0, x1, [sp, #(0 * 16)]
+    stp x2, x3, [sp, #(1 * 16)]
+    stp x4, x5, [sp, #(2 * 16)]
+    stp x6, x7, [sp, #(3 * 16)]
+    stp x8, x9, [sp, #(4 * 16)]
+    stp x10, x11, [sp, #(5 * 16)]
+    stp x12, x13, [sp, #(6 * 16)]
+    stp x14, x15, [sp, #(7 * 16)]
+    stp x16, x17, [sp, #(8 * 16)]
+    stp x18, x19, [sp, #(9 * 16)]
+    stp x20, x21, [sp, #(10 * 16)]
+    stp x22, x23, [sp, #(11 * 16)]
+    stp x24, x25, [sp, #(12 * 16)]
+    stp x26, x27, [sp, #(13 * 16)]
+    stp x28, x29, [sp, #(14 * 16)]
+    str x30, [sp, #(15 * 16)]
+    /* Snapshot exception return state in case irq_dispatch (or any
+     * nested exception inside it) clobbers SPSR_EL3 / ELR_EL3. */
+    mrs x0, spsr_el3
+    mrs x1, elr_el3
+    stp x0, x1, [sp, #(16 * 16)]
+
+    bl irq_dispatch
+
+    ldp x0, x1, [sp, #(16 * 16)]
+    msr spsr_el3, x0
+    msr elr_el3, x1
+    ldp x0, x1, [sp, #(0 * 16)]
+    ldp x2, x3, [sp, #(1 * 16)]
+    ldp x4, x5, [sp, #(2 * 16)]
+    ldp x6, x7, [sp, #(3 * 16)]
+    ldp x8, x9, [sp, #(4 * 16)]
+    ldp x10, x11, [sp, #(5 * 16)]
+    ldp x12, x13, [sp, #(6 * 16)]
+    ldp x14, x15, [sp, #(7 * 16)]
+    ldp x16, x17, [sp, #(8 * 16)]
+    ldp x18, x19, [sp, #(9 * 16)]
+    ldp x20, x21, [sp, #(10 * 16)]
+    ldp x22, x23, [sp, #(11 * 16)]
+    ldp x24, x25, [sp, #(12 * 16)]
+    ldp x26, x27, [sp, #(13 * 16)]
+    ldp x28, x29, [sp, #(14 * 16)]
+    ldr x30, [sp, #(15 * 16)]
+    add sp, sp, #(18 * 16)
+    eret
+
+    .global irq_enable
+    .type irq_enable, %function
+irq_enable:
+    msr daifclr, #3         /* unmask IRQ (bit 1) + FIQ (bit 0) */
+    ret
+
+    .global irq_disable
+    .type irq_disable, %function
+irq_disable:
+    msr daifset, #3         /* mask IRQ (bit 1) + FIQ (bit 0): both vector to el3_irq_trampoline */
+    ret
+
+/* ---------------------------------------------------------------------
+ * Synchronous exception handler - print ESR_EL3 / ELR_EL3 / FAR_EL3
+ * then hang.
Anything that previously fell to _hang silently (alignment
+ * fault, translation fault, undefined instruction) now produces a
+ * UART dump.
+ * ------------------------------------------------------------------- */
+    .type el3_sync_trampoline, %function
+el3_sync_trampoline:
+    mrs x0, esr_el3
+    mrs x1, elr_el3
+    mrs x2, far_el3
+    mrs x3, spsr_el3
+    bl exception_report
+    b _hang
+
+    .type el3_serror_trampoline, %function
+el3_serror_trampoline:
+    mrs x0, esr_el3
+    mrs x1, elr_el3
+    mrs x2, far_el3
+    mrs x3, spsr_el3
+    mov x4, #1              /* indicate SError to C */
+    bl exception_report_serror
+    b _hang
diff --git a/src/port/amd/arch/aarch64/timer.h b/src/port/amd/arch/aarch64/timer.h
new file mode 100644
index 00000000..8f7e8c7a
--- /dev/null
+++ b/src/port/amd/arch/aarch64/timer.h
@@ -0,0 +1,62 @@
+/* timer.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * AArch64 generic-timer based delay helpers (Cortex-A53/A72). The PLM/
+ * FSBL/ATF programs CNTFRQ_EL0; we fall back to 100 MHz if it reads 0.
+ *
+ * CNTPCT_EL0/CNTVCT_EL0 are readable at every EL without trap setup.
+ */
+#ifndef AMD_TIMER_H
+#define AMD_TIMER_H
+
+#include <stdint.h>
+
+/* Current physical counter value (CNTPCT_EL0). The ISB is issued first
+ * so the counter read is not performed ahead of earlier instructions. */
+static inline uint64_t timer_now(void)
+{
+    uint64_t v;
+    __asm__ volatile ("isb; mrs %0, cntpct_el0" : "=r"(v) :: "memory");
+    return v;
+}
+
+/* Counter frequency in Hz as reported by CNTFRQ_EL0 (low 32 bits), or
+ * 100 MHz when the register reads 0. */
+static inline uint32_t timer_freq(void)
+{
+    uint64_t v;
+    __asm__ volatile ("mrs %0, cntfrq_el0" : "=r"(v));
+    return v ? (uint32_t)v : 100000000u;
+}
+
+/* Free-running 64-bit counter for the entropy source (virtual count,
+ * readable at every EL). */
+static inline uint64_t arch_counter64(void)
+{
+    uint64_t v;
+    __asm__ volatile ("mrs %0, cntvct_el0" : "=r"(v));
+    return v;
+}
+
+/* Busy-wait for at least `us` microseconds. The tick target is computed
+ * in 64 bits, so us * timer_freq() cannot overflow. */
+static inline void delay_us(uint32_t us)
+{
+    uint64_t start = timer_now();
+    uint64_t target = ((uint64_t)us * (uint64_t)timer_freq()) / 1000000ULL;
+    while ((timer_now() - start) < target) { }
+}
+
+/* Busy-wait for at least `ms` milliseconds. The tick target is computed
+ * directly in 64 bits rather than via delay_us(ms * 1000u), whose
+ * 32-bit product wraps for ms > 4294967 (~71.6 min) and would silently
+ * shorten the delay. */
+static inline void delay_ms(uint32_t ms)
+{
+    uint64_t start = timer_now();
+    uint64_t target = ((uint64_t)ms * (uint64_t)timer_freq()) / 1000ULL;
+    while ((timer_now() - start) < target) { }
+}
+
+#endif /* AMD_TIMER_H */
diff --git a/src/port/amd/arch/armv7/cache.h b/src/port/amd/arch/armv7/cache.h
new file mode 100644
index 00000000..76a458c3
--- /dev/null
+++ b/src/port/amd/arch/armv7/cache.h
@@ -0,0 +1,45 @@
+/* cache.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * ARMv7-A (Cortex-A9) cache maintenance for GEM DMA coherency. The A9
+ * L1 cache line is 32 bytes (NOT the 64 bytes of the AArch64 cores this
+ * driver is shared with); the stride below must match or the ops skip
+ * lines. In this port the GEM rings/buffers live in OCM which the MMU
+ * maps non-cacheable (see mmu_armv7.c), so these are effectively no-ops,
+ * but they stay correct for a cacheable layout.
+ */
+#ifndef AMD_CACHE_H
+#define AMD_CACHE_H
+
+#include <stdint.h>
+
+#define CACHE_LINE 32u
+
+static inline void cache_clean(const void *p, uint32_t sz)
+{
+    uintptr_t start = (uintptr_t)p & ~(CACHE_LINE - 1u);
+    uintptr_t end = ((uintptr_t)p + sz + CACHE_LINE - 1u) & ~(CACHE_LINE - 1u);
+    uintptr_t a;
+    /* DCCMVAC (clean D-cache line by MVA to PoC): MCR p15, 0, Rt, c7, c10, 1.
+     * '!=' (not '<'): a range ending at 4 GB wraps 'end' to 0 (OCM top). */
+    for (a = start; a != end; a += CACHE_LINE)
+        __asm__ volatile ("mcr p15, 0, %0, c7, c10, 1" :: "r"(a) : "memory");
+    __asm__ volatile ("dsb" ::: "memory");
+}
+
+static inline void cache_inval(const void *p, uint32_t sz)
+{
+    uintptr_t start = (uintptr_t)p & ~(CACHE_LINE - 1u);
+    uintptr_t end = ((uintptr_t)p + sz + CACHE_LINE - 1u) & ~(CACHE_LINE - 1u);
+    uintptr_t a;
+    /* DCIMVAC (invalidate D-cache line by MVA to PoC): MCR p15, 0, Rt, c7, c6, 1.
+     * '!=' (not '<'): a range ending at 4 GB wraps 'end' to 0 (OCM top). */
+    for (a = start; a != end; a += CACHE_LINE)
+        __asm__ volatile ("mcr p15, 0, %0, c7, c6, 1" :: "r"(a) : "memory");
+    __asm__ volatile ("dsb" ::: "memory");
+}
+
+#endif /* AMD_CACHE_H */
diff --git a/src/port/amd/arch/armv7/mmu_armv7.c b/src/port/amd/arch/armv7/mmu_armv7.c
new file mode 100644
index 00000000..0307636c
--- /dev/null
+++ b/src/port/amd/arch/armv7/mmu_armv7.c
@@ -0,0 +1,147 @@
+/* mmu_armv7.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * ARMv7-A short-format L1 page tables (16 KB-aligned, 16 KB total,
+ * 4096 1 MB section descriptors covering the whole 32-bit VA space).
+ * Sufficient for a flat-mapped bare-metal app on Zynq-7000 PS.
+ *
+ *   0x00000000 - 0x3FFFFFFF  DDR (1 GB) - Normal WB cacheable
+ *   0x40000000 - 0xDFFFFFFF  unmapped (PL interconnect / SMC ranges)
+ *   0xE0000000 - 0xFFFFFFFF  PS peripherals (UART, GEM, SLCR, GIC, OCM)
+ *                            mostly Shareable Device, except the OCM
+ *                            high-mapping (0xFFFC0000..0xFFFFFFFF)
+ *                            which is Normal Non-Cacheable executable
+ *                            (SEC_NORMAL_NC) for GEM DMA coherency.
+ *
+ * Short descriptor section format (1 MB, supersection ignored):
+ *   bits [31:20] = section base
+ *   bit  [19]    = NS (we are Secure -> 0); bit [18] = 0 (section, not supersection)
+ *   bit  [17]    = nG (global)
+ *   bit  [16]    = S (shareable)
+ *   bits [14:12] = TEX
+ *   bits [11:10] = AP[1:0]
+ *   bit  [15]    = AP[2]
+ *   bit  [9]     = IMPDEF
+ *   bits [8:5]   = Domain
+ *   bit  [4]     = XN (execute never)
+ *   bit  [3]     = C
+ *   bit  [2]     = B
+ *   bits [1:0]   = 10 (section)
+ *
+ * TEX[2:0] + C + B encoding for Normal WB cacheable (TEX=001, C=1, B=1)
+ * and Shareable Device (TEX=000, C=0, B=1) per ARMv7-A short descriptor.
+ *
+ * Brought up on a ZC702 (Cortex-A9).
+ */
+#include <stdint.h>
+#include "mmu.h"
+
+static volatile uint32_t L1[4096] __attribute__((aligned(16384),
+                                                 section(".page_tables")));
+
+#define SEC_NORMAL_WB(addr) \
+    (((addr) & 0xFFF00000u) | \
+     (1u << 12) | /* TEX[0] = 1 */ \
+     (1u << 10) | /* AP[0] = 1 -> AP=0b001 (RW at PL1 only) */ \
+     (1u << 3) | /* C */ \
+     (1u << 2) | /* B */ \
+     0x2u) /* section */
+
+#define SEC_DEVICE(addr) \
+    (((addr) & 0xFFF00000u) | \
+     (1u << 10) | /* AP[0] = 1 -> AP=0b001 (RW at PL1 only) */ \
+     (1u << 4) | /* XN */ \
+     (1u << 2) | /* B (shareable device) */ \
+     0x2u)
+
+/* Normal, Non-cacheable, executable (TEX=001, C=0, B=0).
Used for the + * OCM section so the GEM DMA descriptor rings and frame buffers (which + * live in OCM) are coherent with the Cortex-A9 without per-descriptor + * cache maintenance. The 8-byte GEM BDs otherwise share 32-byte cache + * lines (4 BDs per line), and cleaning one BD's line read-modify-writes + * the MAC's OWN bits on the neighbours, stalling RX. Code still executes + * from this region (not XN); it is just slower than cached. */ +#define SEC_NORMAL_NC(addr) \ + (((addr) & 0xFFF00000u) | \ + (1u << 12) | /* TEX[0] = 1 -> TEX=001 (Normal) */ \ + (1u << 10) | /* AP[1] = 1 (RW PL1+) */ \ + 0x2u) /* C=0, B=0 -> non-cacheable; section */ + +#define SEC_INVALID (0u) + +extern uint8_t _dma_buffers_start[]; +extern uint8_t _dma_buffers_end[]; + +static void mmu_build_tables(void) +{ + uint32_t i; + uint32_t addr; + + for (i = 0; i < 4096; i++) + L1[i] = SEC_INVALID; + + /* DDR 0x00000000 - 0x3FFFFFFF (1 GB) as Normal WB. */ + for (i = 0; i < 1024; i++) { + addr = i * 0x100000u; + L1[i] = SEC_NORMAL_WB(addr); + } + + /* PS peripherals at 0xE0000000 - 0xFEFFFFFF (Device). */ + for (i = 0xE00; i < 0xFF0; i++) { + addr = i * 0x100000u; + L1[i] = SEC_DEVICE(addr); + } + + /* OCM high mapping 0xFFFC0000 - 0xFFFFFFFF (last 256 KB of 4 GB). + * The section at 0xFFF00000 (1 MB) covers it. Mark Normal + * NON-cacheable but executable: the whole app (code, data, stack and + * the GEM DMA rings/buffers) lives in OCM, and the descriptors must be + * non-cacheable for DMA coherency (see SEC_NORMAL_NC). */ + L1[0xFFF] = SEC_NORMAL_NC(0xFFF00000u); +} + +void mmu_enable(void) +{ + uint32_t sctlr; + + mmu_build_tables(); + + /* DACR: domain 0 = Client (check permissions). */ + __asm__ volatile ("mcr p15, 0, %0, c3, c0, 0" :: "r"(0x55555555u)); + + /* TTBR0 = L1 (low 32 bits of physical address). TTBR1 unused. 
*/ + __asm__ volatile ("mcr p15, 0, %0, c2, c0, 0" :: "r"((uint32_t)L1)); + __asm__ volatile ("mcr p15, 0, %0, c2, c0, 2" :: "r"(0u)); /* TTBCR=0 */ + + /* Invalidate TLB + I-cache. */ + __asm__ volatile ("mcr p15, 0, %0, c8, c7, 0" :: "r"(0u)); /* TLBIALL */ + __asm__ volatile ("mcr p15, 0, %0, c7, c5, 0" :: "r"(0u)); /* ICIALLU */ + __asm__ volatile ("dsb" ::: "memory"); + __asm__ volatile ("isb" ::: "memory"); + + /* Enable MMU + I-cache + D-cache. */ + __asm__ volatile ("mrc p15, 0, %0, c1, c0, 0" : "=r"(sctlr)); + sctlr |= (1u << 0); /* M */ + sctlr |= (1u << 2); /* C */ + sctlr |= (1u << 12); /* I */ + sctlr &= ~(1u << 1); /* A off */ + __asm__ volatile ("mcr p15, 0, %0, c1, c0, 0" :: "r"(sctlr)); + __asm__ volatile ("isb" ::: "memory"); +} diff --git a/src/port/amd/arch/armv7/startup_armv7.S b/src/port/amd/arch/armv7/startup_armv7.S new file mode 100644 index 00000000..e3290aed --- /dev/null +++ b/src/port/amd/arch/armv7/startup_armv7.S @@ -0,0 +1,155 @@ +/* startup.S + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * ARMv7-A entry for Cortex-A9 on Xilinx Zynq-7000. Stock Xilinx FSBL + * configures DDR, clocks, MIO, then jumps to _start at the image + * entry. 
We bring up the SVC-mode stack, install the exception + * vector table, clear BSS, and call mmu_enable + main. + * + * Brought up on a ZC702 (Cortex-A9): the ARMv7 mode switching, + * exception vectors and MMU enable below run on real hardware. + */ + .arch armv7-a + .syntax unified + + .section .vectors, "ax" + .global _vectors +_vectors: + b _start /* 0x00: reset */ + b _undef_handler /* 0x04: undefined instruction */ + b _svc_handler /* 0x08: SVC */ + b _prefetch_abort /* 0x0C: prefetch abort */ + b _data_abort /* 0x10: data abort */ + nop /* 0x14: reserved */ + b _irq_handler /* 0x18: IRQ */ + b _fiq_handler /* 0x1C: FIQ */ + + .section .text, "ax" + .global _start + .type _start, %function + .arm +_start: + /* Disable IRQ + FIQ until we have a stack. */ + cpsid if + + /* Switch to SVC mode (FSBL may have left us in another mode). */ + cps #0x13 /* SVC mode */ + + /* Set the SVC-mode stack. */ + ldr sp, =_stack_top + + /* Install vector base address (VBAR). VBAR is implemented on + * Cortex-A9 (CP15 c12 c0 0). */ + ldr r0, =_vectors + mcr p15, 0, r0, c12, c0, 0 /* MCR VBAR, r0 */ + isb + + /* Disable MMU + caches in case FSBL left them on. SCTLR is CP15 + * c1 c0 0. */ + mrc p15, 0, r0, c1, c0, 0 + bic r0, r0, #(1 << 0) /* M -- MMU disable */ + bic r0, r0, #(1 << 2) /* C -- D-cache disable */ + bic r0, r0, #(1 << 12) /* I -- I-cache disable */ + mcr p15, 0, r0, c1, c0, 0 + isb + + /* Very-early UART poke - one char before any C code. The ZC702 + * routes the USB console to Cadence UART1; its TX FIFO is at + * 0xE0001030. */ + ldr r1, =0xE0001000 + mov r2, #'!' + str r2, [r1, #0x30] + + /* Clear BSS. */ + ldr r0, =_sbss + ldr r1, =_ebss + mov r2, #0 +1: cmp r0, r1 + bge 2f + str r2, [r0], #4 + b 1b +2: + + /* Bring up the MMU + caches (C function in mmu.c). */ + bl mmu_enable + + /* Breadcrumb: MMU enabled. */ + ldr r1, =0xE0001000 + mov r2, #'M' + str r2, [r1, #0x30] + + /* Call main(). */ + bl main + + /* main() should not return. 
*/
+    b _hang
+
+    .global _hang
+    .type _hang, %function
+_hang:
+    b _hang
+
+/* ----------------------------------------------------------------------
+ * Exception handlers. The IRQ vector funnels into irq_dispatch (C),
+ * mirroring the AArch64 trampoline in arch/aarch64/startup_aarch64.S. We
+ * save r0-r12 + lr_svc so the C irq_dispatch is safe to call.
+ * -------------------------------------------------------------------- */
+    .type _irq_handler, %function
+_irq_handler:
+    /* Adjust lr_irq for return-from-exception (LR points past). */
+    sub lr, lr, #4
+    srsdb sp!, #0x13        /* save lr_irq, spsr_irq to SVC stack */
+    cpsid if, #0x13         /* switch to SVC mode for the handler */
+    push {r0-r12, lr}
+    bl irq_dispatch
+    pop {r0-r12, lr}
+    rfeia sp!               /* return from exception */
+
+    .type _fiq_handler, %function
+_fiq_handler:
+    b _hang                 /* FIQ unused */
+
+    .type _undef_handler, %function
+_undef_handler:
+    b _hang
+
+    .type _svc_handler, %function
+_svc_handler:
+    b _hang
+
+    .type _prefetch_abort, %function
+_prefetch_abort:
+    b _hang
+
+    .type _data_abort, %function
+_data_abort:
+    b _hang
+
+    .global irq_enable
+    .type irq_enable, %function
+irq_enable:
+    cpsie if                /* enable IRQ + FIQ */
+    bx lr
+
+    .global irq_disable
+    .type irq_disable, %function
+irq_disable:
+    cpsid if
+    bx lr
diff --git a/src/port/amd/arch/armv7/timer.h b/src/port/amd/arch/armv7/timer.h
new file mode 100644
index 00000000..7d119ada
--- /dev/null
+++ b/src/port/amd/arch/armv7/timer.h
@@ -0,0 +1,83 @@
+/* timer.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * Cortex-A9 (Zynq-7000) delay helpers.
+ *
+ * The Cortex-A9 does NOT implement the ARMv7 generic timer
+ * (CNTPCT/CNTFRQ via CP15 c14) -- those encodings are UNDEFINED on this
+ * core and trap to the undefined-instruction vector. The MPCore provides
+ * a 64-bit free-running Global Timer at PERIPHBASE+0x200 instead. It is
+ * clocked at PERIPHCLK = CPU_3x2x = 333.33 MHz on the ZC702 with the
+ * default FSBL clock plan (ARM_PLL 1.333 GHz, CPU_6x4x 666.67 MHz).
+ * Override Z7_GTIMER_FREQ_HZ if you reclock the CPU.
+ */
+#ifndef AMD_TIMER_H
+#define AMD_TIMER_H
+
+#include <stdint.h>
+
+#define Z7_GTIMER_BASE 0xF8F00200u
+#define Z7_GTIMER_LO (*(volatile uint32_t *)(Z7_GTIMER_BASE + 0x00))
+#define Z7_GTIMER_HI (*(volatile uint32_t *)(Z7_GTIMER_BASE + 0x04))
+#define Z7_GTIMER_CTRL (*(volatile uint32_t *)(Z7_GTIMER_BASE + 0x08))
+#define Z7_GTIMER_CTRL_EN 0x00000001u
+
+#ifndef Z7_GTIMER_FREQ_HZ
+#define Z7_GTIMER_FREQ_HZ 333333333u
+#endif
+
+/* 64-bit Global Timer count. Starts the timer first if its enable bit
+ * is clear, then performs a wrap-safe high/low/high read. */
+static inline uint64_t timer_now(void)
+{
+    uint32_t hi1, lo, hi2;
+
+    /* Enable the Global Timer once if the FSBL left it stopped. */
+    if ((Z7_GTIMER_CTRL & Z7_GTIMER_CTRL_EN) == 0)
+        Z7_GTIMER_CTRL = Z7_GTIMER_CTRL_EN;
+
+    /* Read high, low, high again; retry if a wrap happened mid-read. */
+    do {
+        hi1 = Z7_GTIMER_HI;
+        lo = Z7_GTIMER_LO;
+        hi2 = Z7_GTIMER_HI;
+    } while (hi1 != hi2);
+
+    return ((uint64_t)hi2 << 32) | (uint64_t)lo;
+}
+
+/* Assumed Global Timer rate in Hz (compile-time Z7_GTIMER_FREQ_HZ). */
+static inline uint32_t timer_freq(void)
+{
+    return Z7_GTIMER_FREQ_HZ;
+}
+
+/* Free-running 64-bit counter for the entropy source (MPCore global
+ * timer; the A9 has no ARMv7 generic timer). */
+static inline uint64_t arch_counter64(void)
+{
+    return timer_now();
+}
+
+/* Busy-wait for at least `us` microseconds. The tick target is computed
+ * in 64 bits, so us * timer_freq() cannot overflow. */
+static inline void delay_us(uint32_t us)
+{
+    uint64_t start = timer_now();
+    uint64_t target = ((uint64_t)us * (uint64_t)timer_freq()) / 1000000ULL;
+    while ((timer_now() - start) < target) { }
+}
+
+/* Busy-wait for at least `ms` milliseconds. The tick target is computed
+ * directly in 64 bits rather than via delay_us(ms * 1000u), whose
+ * 32-bit product wraps for ms > 4294967 (~71.6 min) and would silently
+ * shorten the delay. */
+static inline void delay_ms(uint32_t ms)
+{
+    uint64_t start = timer_now();
+    uint64_t target = ((uint64_t)ms * (uint64_t)timer_freq()) / 1000ULL;
+    while ((timer_now() - start) < target) { }
+}
+
+#endif /* AMD_TIMER_H */
diff --git a/src/port/amd/boards/versal/.gitignore b/src/port/amd/boards/versal/.gitignore
new file mode 100644
index 00000000..c6cf5083
--- /dev/null
+++ b/src/port/amd/boards/versal/.gitignore
@@ -0,0 +1,5 @@
+*.o
+*.elf
+*.bin
+BOOT.BIN
+.layout_stamp
diff --git a/src/port/amd/boards/versal/Makefile b/src/port/amd/boards/versal/Makefile
new file mode 100644
index 00000000..0f23141e
--- /dev/null
+++ b/src/port/amd/boards/versal/Makefile
@@ -0,0 +1,119 @@
+# Xilinx Versal Gen 1 (VMK180, Cortex-A72) wolfIP bare-metal port
+#
+# Build: make CROSS_COMPILE=aarch64-none-elf-
+#
+# Toolchain: ARM GNU aarch64-none-elf-gcc (tested with 14.3.rel1).
+#
+# Brought up on a VMK180 (Cortex-A72): DHCP, ICMP ping and UDP echo work.
+
+CROSS_COMPILE ?= aarch64-none-elf-
+CC := $(CROSS_COMPILE)gcc
+OBJCOPY := $(CROSS_COMPILE)objcopy
+SIZE := $(CROSS_COMPILE)size
+
+ROOT := ../../../../..
+AMD := $(ROOT)/src/port/amd
+COMMON := $(AMD)/common
+IP := $(AMD)/ip
+ARCH := $(AMD)/arch/aarch64
+
+# Cortex-A72, AArch64, EL3 single-EL bare-metal. No SIMD/FP in the
+# wolfIP/driver paths - keep -mgeneral-regs-only to catch any
+# accidental FP use and make the ABI deterministic for cert.
+CFLAGS := -mcpu=cortex-a72 -mgeneral-regs-only
+CFLAGS += -Os -ffreestanding -fno-builtin -fno-common
+CFLAGS += -fdata-sections -ffunction-sections
+CFLAGS += -g -Wall -Wextra -Werror -Wno-unused-parameter
+CFLAGS += -std=gnu99
+CFLAGS += -I.
-I$(COMMON) -I$(ARCH) -I$(IP) -I$(ROOT) -I$(ROOT)/src -I$(ROOT)/src/port +CFLAGS += -DVERSAL -DXILINX_AARCH64 +# Append extra defines for investigation builds, e.g.: +# make CFLAGS_EXTRA="-DDEBUG_GEM -DDEBUG_PHY" +CFLAGS += $(CFLAGS_EXTRA) + +ASFLAGS := -mcpu=cortex-a72 -DUART_EARLY_TX_OFF=0x00 + +# Layout selector. Default ocm keeps the OCM-only layout that the JTAG +# iteration scripts depend on (everything in OCM @ 0xFFFC0000). Pass +# LAYOUT=ddr to relink for DDR @ 0x10000000 -- which is what wolfBoot +# uses (WOLFBOOT_LOAD_ADDRESS in zynqmp.config also applies to Versal +# when adapted). +LAYOUT ?= ocm +ifeq ($(LAYOUT),ddr) + LDSCRIPT := target_ddr.ld + CFLAGS += -DAMD_LAYOUT_DDR +else ifeq ($(LAYOUT),ocm) + LDSCRIPT := target.ld + CFLAGS += -DAMD_LAYOUT_OCM +else + $(error LAYOUT must be 'ocm' or 'ddr') +endif + +LDFLAGS := -nostdlib -nostartfiles -T $(LDSCRIPT) -Wl,-gc-sections +# Replace newlib's aarch64 memset/memcpy (which use 'dc zva' and may +# hang on a similar Cortex-A72 setup; the safe pattern is to override +# them as we did on ZCU102). +LDFLAGS += -Wl,--wrap=memset -Wl,--wrap=memcpy + +LOCAL_C := app.c board.c uart_pl011.c uart_util.c mmu_aarch64.c gic_gicv3.c \ + gem_core.c board_gem.c gem_swq.c gem_rx_swq_poll.c \ + phy_dp83867.c phy_dispatch_dp83867.c entropy.c exception_aarch64.c +LOCAL_S := startup_aarch64.S +LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o) + +WOLFIP_OBJ := wolfip.o +OBJS := $(LOCAL_OBJS) $(WOLFIP_OBJ) + +# Shared sources live outside this board dir; find them by vpath so the +# .o files still land here (keeps clean + JTAG app.elf-in-place working). +vpath %.c $(COMMON):$(ARCH):$(IP) +vpath %.S $(ARCH) + +# Keep 'all' the default goal even though the layout-stamp rules below +# are defined before it. +.DEFAULT_GOAL := all + +# A change in LAYOUT must force a full rebuild: OCM objects link against +# 0xFFFC0000 and a DDR build against 0x10000000, so reusing stale objects +# across a layout switch silently produces a wrong image. 
The stamp +# records the last LAYOUT; its mtime only bumps when LAYOUT actually +# changes, so same-layout incremental builds are unaffected. +LAYOUT_STAMP := .layout_stamp +.PHONY: FORCE +FORCE: +$(LAYOUT_STAMP): FORCE + @if [ "`cat $@ 2>/dev/null`" != "$(LAYOUT)" ]; then \ + echo "LAYOUT -> $(LAYOUT) (was `cat $@ 2>/dev/null`); forcing rebuild"; \ + echo "$(LAYOUT)" > $@; \ + fi +$(OBJS): $(LAYOUT_STAMP) + +all: app.elf + @echo "Built: app.elf" + @$(SIZE) app.elf + +app.elf: $(OBJS) $(LDSCRIPT) + $(CC) $(CFLAGS) $(OBJS) $(LDFLAGS) \ + -Wl,--start-group -lc -lgcc -Wl,--end-group -o $@ + +$(WOLFIP_OBJ): $(ROOT)/src/wolfip.c + $(CC) $(CFLAGS) -Wno-zero-length-bounds -Wno-type-limits -c $< -o $@ + +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +%.o: %.S + $(CC) $(ASFLAGS) -c $< -o $@ + +clean: + rm -f $(OBJS) app.elf BOOT.BIN $(LAYOUT_STAMP) + +.PHONY: all clean help + +help: + @echo "Versal Gen 1 wolfIP build (Cortex-A72):" + @echo " make - build app.elf (OCM layout)" + @echo " make LAYOUT=ddr - DDR layout for wolfBoot" + @echo " make clean - remove artifacts" + @echo "" + @echo "Override CROSS_COMPILE if your toolchain prefix differs." diff --git a/src/port/amd/boards/versal/README.md b/src/port/amd/boards/versal/README.md new file mode 100644 index 00000000..7a4ed5c7 --- /dev/null +++ b/src/port/amd/boards/versal/README.md @@ -0,0 +1,78 @@ +# wolfIP port: Xilinx Versal Gen 1 (VMK180) + +**STATUS: brought up on a VMK180.** DHCP, ICMP ping and the UDP echo demo all work on real hardware (Cortex-A72 EL3, GEM0 + DP83867). See "Hardware bring-up notes" below for the Versal-specific differences. + +## What this port is + +Bare-metal wolfIP port for the AMD/Xilinx Versal ACAP Gen 1, demoed on the VMK180 dev board. Cortex-A72 APU 0 at EL3, GCC bare-metal, no Xilinx Standalone BSP, no FreeRTOS. Targets the same deterministic UDP/IPv4 profile as the ZCU102 port for DO-178C DAL-C qualification. 
+
+## What differs from ZCU102
+
+| Subsystem | ZCU102 | Versal Gen 1 | Where it lives |
+|-----------|--------|--------------|----------------|
+| APU core | Cortex-A53 | Cortex-A72 | `Makefile` (-mcpu) |
+| Bootloader handoff | FSBL -> EL3 | PLM -> BL31 -> EL3 (or EL2) | `startup_aarch64.S` |
+| GIC | GIC-400 (GICv2) | GIC-600 (GICv3) | `gic_gicv3.c` rewritten for GICv3 system regs + GICR |
+| UART | Cadence | ARM PL011 | `uart_pl011.c` rewritten |
+| GEM count | 4 (GEM0-3) | 2 (GEM0-1) | `board.h` |
+| On-board RJ45 | GEM3 (INTID 95) | GEM0 (INTID 88) | `board.h` |
+| GEM IP | Cadence GEM3 | Cadence GEM3 | `gem_core.c` unchanged (just base addr / INTID) |
+| PHY | DP83867 RGMII | DP83867 RGMII (VMK180) | `phy_dp83867.c` unchanged |
+| MMU | EL3 ARMv8 | EL3 ARMv8 | `mmu_aarch64.c` unchanged |
+| RNG | memuse entropy | memuse entropy | `entropy.c` unchanged |
+
+The reused 90% (`gem_core.c`, `phy_dp83867.c`, `mmu_aarch64.c`, `entropy.c`, `app.c`, `target.ld`, `target_ddr.ld`) is identical to the ZCU102 port; only `board.h`, `uart_pl011.c`, `gic_gicv3.c`, and the startup/Makefile breadcrumbs are Versal-specific.
+
+## Build
+
+```
+cd src/port/amd/boards/versal
+make CROSS_COMPILE=aarch64-none-elf- # OCM layout (default)
+make CROSS_COMPILE=aarch64-none-elf- LAYOUT=ddr # DDR layout for wolfBoot
+```
+
+Output: `app.elf`. Size info is printed at the end of the build.
+
+## JTAG boot (VMK180)
+
+The VMK180 must be in **JTAG boot mode** (SW1 mode pins = 0000) and
+power-cycled so the BootROM does not auto-boot Linux from SD/QSPI -- a
+booted Linux owns GEM0 and runtime-suspends its clock, which stalls the
+bare-metal driver. Then:
+
+```
+XSDB=/opt/Xilinx/<version>/Vitis/bin/xsdb \
+BOOT_PDI=/path/to/vmk180_boot.pdi \
+./jtag/boot.sh
+```
+
+`jtag/boot.tcl` does `rst -system`, programs the boot PDI through the PMC
+(the PLM brings up DDR/clocks/MIO and de-isolates the A72), then resets
+A72 #0 (`-skip-activate-subsystem`, which lands at EL3) and loads
+`app.elf`.
The PS console is on FT4232 **interface 1** +(`VERSAL_VMK180_UART1`). + +## Hardware bring-up notes (what was Versal-specific) + +- **GEM RX is poll-driven.** The GICv3 CPU interface did not deliver the + GEM SPI in this EL3 bring-up, so `eth_poll` polls `gem_isr` from the + main loop to drain the RX ring (the IRQ path stays registered but + dormant). +- **GEM clock is owned by the PLM.** The CRL block is PMC/PLM-protected; + a direct APU write to `CRL.GEM0_REF_CTRL` (`0xFF5E0118`, not the ZynqMP + `+0x50`) stalls the bus. The PLM already configures the GEM clock, so + `gem_core.c` does not touch it. The correct Versal offsets are documented in + `board.h` for reference. +- **Two DP83867 PHYs.** The VMK180 presents more than one PHY on the MDIO + bus; `gem_core.c` scans all 32 addresses and prefers the one reporting copper + link (the on-board RJ45 PHY answered at addr 1). Make sure the cable is + in the **PS-GEM** RJ45, not the System Controller jack. +- `SCR_EL3` IRQ/FIQ/EA routing is carried over from the ZCU102 fix and is + harmless on the A72. + +## Files + +See `src/port/amd/README.md` for the shared-tree layout. This board dir +holds `board.h`, `board.c`, `board_gem.c`, `config.h`, the linker scripts +and `jtag/` (PDI-based JTAG loader). The differences listed in the table +above are the only substantive Versal-specific code. diff --git a/src/port/amd/boards/versal/board.c b/src/port/amd/boards/versal/board.c new file mode 100644 index 00000000..33de0680 --- /dev/null +++ b/src/port/amd/boards/versal/board.c @@ -0,0 +1,24 @@ +/* board.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Versal (VMK180) board hooks for the shared demo (app.c). + */ +#include "app.h" +#include "gic.h" + +const char *board_banner(void) +{ + return "\n\n=== wolfIP Versal Gen 1 (VMK180, Cortex-A72 EL3) ===\n" + "MMU on, caches on. 
Bringing up GIC-600 (GICv3)...\n"; +} + +void board_irq_setup(void) +{ + /* The GICv3 CPU interface did not deliver the GEM SPI in this EL3 + * bring-up (eth_poll drives gem_isr from the main loop instead), but + * we still unmask at the CPU defensively. */ + irq_enable(); +} diff --git a/src/port/amd/boards/versal/board.h b/src/port/amd/boards/versal/board.h new file mode 100644 index 00000000..b92b86e3 --- /dev/null +++ b/src/port/amd/boards/versal/board.h @@ -0,0 +1,119 @@ +/* board.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Xilinx Versal Gen 1 (VCxxxx / VMK180 board) PS register base + * addresses and GIC SPI IDs. Values are derived from the Versal ACAP + * Technical Reference Manual (AM011), the VMK180 board user guide, + * and the published `versal.dtsi` device tree. No Xilinx BSP header + * (xparameters.h) or xilstandalone code is referenced. + * + * Brought up on a VMK180 (Cortex-A72). The structure mirrors + * src/port/zcu102/. 
Key differences from ZynqMP are:
+ *   - Cortex-A72 (not A53); the PLM -> TF-A (BL31) chain hands off at EL3
+ *   - GICv3 distributor + redistributor (no GICv2 legacy GICC)
+ *   - ARM PL011 UART (not Cadence)
+ *   - 2 GEMs (GEM0/GEM1) instead of 4; on-board RJ45 is GEM0 on VMK180
+ */
+#ifndef VERSAL_BOARD_H
+#define VERSAL_BOARD_H
+
+#include <stdint.h>
+
+/* ---------------------------------------------------------------------
+ * Memory map (Versal PS)
+ * ------------------------------------------------------------------- */
+#define DDR_BASE 0x00000000UL
+#define DDR_SIZE 0x80000000UL /* 2 GB lower bank */
+
+/* OCM on Versal lives at 0xFFFC0000 (256 KB). Same as ZynqMP. */
+#define OCM_BASE 0xFFFC0000UL
+#define OCM_SIZE 0x00040000UL
+
+/* ---------------------------------------------------------------------
+ * PS peripherals
+ * ------------------------------------------------------------------- */
+#define UART0_BASE 0xFF000000UL /* PL011 */
+#define UART1_BASE 0xFF010000UL /* PL011 */
+
+#define UART_BASE UART0_BASE /* console PL011 */
+
+#define GEM0_BASE 0xFF0C0000UL /* on-board GEM (VMK180) */
+#define GEM1_BASE 0xFF0D0000UL
+
+/* On-board RJ45 is GEM0 on the VMK180 (IRQ_GEM0 is defined further down). */
+#define GEM_BASE GEM0_BASE
+#define IRQ_GEM IRQ_GEM0
+
+#define CRL_APB_BASE 0xFF5E0000UL /* LPD clock & reset */
+#define IOU_SLCR_BASE 0xFF180000UL
+
+/* GICv3: distributor + redistributor */
+#define GICD_BASE 0xF9000000UL
+#define GICR_BASE 0xF9080000UL /* per-CPU redistributors */
+
+/* ---------------------------------------------------------------------
+ * GIC SPI numbers as GIC INTIDs (ARM GIC numbering: SPI N -> INTID 32+N).
+ * Versal versal.dtsi: + * GEM0: GIC_SPI 56 -> INTID 88 + * GEM1: GIC_SPI 58 -> INTID 90 + * ------------------------------------------------------------------- */ +#define IRQ_GEM0 (32 + 56) /* GIC_SPI 56 -> INTID 88, + * on-board VMK180 RJ45 */ +#define IRQ_GEM1 (32 + 58) /* GIC_SPI 58 -> INTID 90 */ + +/* --------------------------------------------------------------------- + * CRL clock and reset registers (LPD). Versal's CRL register map is NOT + * the same as ZynqMP: the GEM clock/reset offsets differ. Verified + * against the Versal PSM firmware crl.h (Vitis embeddedsw): + * CRL.GEM0_REF_CTRL = CRL + 0x118 (CLKACT bit 25, DIVISOR0 [13:8], + * SRCSEL [2:0]) + * CRL.RST_GEM0 = CRL + 0x308 (RESET bit 0) + * ------------------------------------------------------------------- */ +#define CRL_APB_GEM0_REF_CTRL (CRL_APB_BASE + 0x118) /* Versal CRL.GEM0_REF_CTRL */ +#define CRL_GEM0_RST (CRL_APB_BASE + 0x308) /* Versal CRL.RST_GEM0 */ +#define CRL_GEM0_REF_CTRL_CLKACT (1u << 25) +#define CRL_RST_GEM0_RESET (1u << 0) + +/* --------------------------------------------------------------------- + * PL011 UART0 - on-board USB-UART on VMK180 + * ------------------------------------------------------------------- */ +#define UART_BAUD 115200 + +/* MAC address for eth0. Locally-administered, even first octet. */ +#ifndef WOLFIP_MAC_0 +#define WOLFIP_MAC_0 0x02 +#endif +#ifndef WOLFIP_MAC_1 +#define WOLFIP_MAC_1 0x00 +#endif +#ifndef WOLFIP_MAC_2 +#define WOLFIP_MAC_2 0x5A +#endif +#ifndef WOLFIP_MAC_3 +#define WOLFIP_MAC_3 0x11 +#endif +#ifndef WOLFIP_MAC_4 +#define WOLFIP_MAC_4 0x22 +#endif +#ifndef WOLFIP_MAC_5 +#define WOLFIP_MAC_5 0x33 +#endif + +#endif /* VERSAL_BOARD_H */ diff --git a/src/port/amd/boards/versal/board_gem.c b/src/port/amd/boards/versal/board_gem.c new file mode 100644 index 00000000..67ba682f --- /dev/null +++ b/src/port/amd/boards/versal/board_gem.c @@ -0,0 +1,30 @@ +/* board_gem.c + * + * Copyright (C) 2026 wolfSSL Inc. 
+ * + * This file is part of wolfIP TCP/IP stack. + * + * Versal GEM clock/reset hooks for the shared GEM core. On Versal the GEM + * clock/reset live in the CRL block, owned by the PMC/PLM: a direct APU + * *write* to a protected CRL register (e.g. CRL.GEM0_REF_CTRL) stalls the + * bus and hangs the core. The PLM has already brought GEM0 out of reset + * and programmed its reference clock and MIO, so these hooks touch + * nothing -- the per-MAC soft reset in amd_eth_init is enough. + */ +#include "gem_port.h" + +void gem_soc_pre_init(void) +{ + /* No SoC quirk needed before MAC config. */ +} + +void gem_set_ref_clk(int speed_mbps) +{ + (void)speed_mbps; /* clock owned by the PLM; nothing to do */ +} + +void gem_clk_reset(void) +{ + /* GEM0 is already out of reset and clocked by the PLM; do not poke + * the protected CRL registers (would stall the bus). */ +} diff --git a/src/port/amd/boards/versal/config.h b/src/port/amd/boards/versal/config.h new file mode 100644 index 00000000..fd8b8f29 --- /dev/null +++ b/src/port/amd/boards/versal/config.h @@ -0,0 +1,31 @@ +/* config.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * wolfIP configuration for Xilinx Versal Gen 1 / VMK180 (Cortex-A72 + * EL3 bare-metal). 
Shared AMD/Xilinx profile lives in common/wolfip_config.h. + */ +#ifndef WOLF_CONFIG_H +#define WOLF_CONFIG_H + +/* Per-board overrides (if any) go here, before the shared profile. */ + +#include "wolfip_config.h" + +#endif /* WOLF_CONFIG_H */ diff --git a/src/port/amd/boards/versal/jtag/boot.sh b/src/port/amd/boards/versal/jtag/boot.sh new file mode 100755 index 00000000..e094f13e --- /dev/null +++ b/src/port/amd/boards/versal/jtag/boot.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# +# Boot the wolfIP Versal (VMK180) app via JTAG. Assumes a hw_server +# reachable on localhost (the default when Vitis is local). +# +# Unlike the ZynqMP flow there is no psu_init.tcl and no objcopy step: +# the Versal PLM brings the platform up from a boot PDI (programmed over +# JTAG), and we load the app ELF directly with xsdb `dow`. +# +# IMPORTANT: set the VMK180 boot-mode switch SW1 to JTAG (mode pins +# 0000, all OFF) and power-cycle first. In a flash/SD boot mode the board +# boots Linux, whose macb driver owns GEM0 and runtime-suspends its +# clock -- our bare-metal driver then stalls on the GEM registers. +# +# Required env (no built-in defaults; set per-developer): +# XSDB - path to Vitis xsdb binary +# (e.g. /opt/Xilinx/2025.2/Vitis/bin/xsdb) +# BOOT_PDI - path to a VMK180 boot PDI. The PLM in this PDI configures +# PMC/PSM/NoC/DDR/MIO/clocks. A prebuilt vmk180 PDI works. +# +# Optional env (sensible defaults): +# APP_ELF - default: ${PORT_DIR}/app.elf (build with LAYOUT=ddr so +# the app loads to DDR, which the PLM has trained) +# +# Usage (from the port directory): +# XSDB=/opt/Xilinx/2025.2/Vitis/bin/xsdb \ +# BOOT_PDI=/path/to/vmk180_boot.pdi \ +# ./jtag/boot.sh +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PORT_DIR="$(dirname "${SCRIPT_DIR}")" + +: "${XSDB:?XSDB is required (path to Vitis xsdb binary)}" +: "${BOOT_PDI:?BOOT_PDI is required (path to a VMK180 boot PDI)}" +APP_ELF="${APP_ELF:-${PORT_DIR}/app.elf}" + +if ! 
command -v "${XSDB}" >/dev/null 2>&1 && [[ ! -x "${XSDB}" ]]; then + echo "ERROR: xsdb not found / not executable: ${XSDB}" >&2 + exit 1 +fi +if [[ ! -f "${BOOT_PDI}" ]]; then + echo "ERROR: BOOT_PDI not found at ${BOOT_PDI}" >&2 + exit 1 +fi +if [[ ! -f "${APP_ELF}" ]]; then + echo "ERROR: app.elf not found at ${APP_ELF}. Run 'make LAYOUT=ddr' first." >&2 + exit 1 +fi + +echo "JTAG boot Versal (VMK180) wolfIP app" +echo " xsdb : ${XSDB}" +echo " boot pdi : ${BOOT_PDI}" +echo " app.elf : ${APP_ELF}" +echo + +export APP_ELF BOOT_PDI + +"${XSDB}" "${SCRIPT_DIR}/boot.tcl" + +echo +echo "App is running. Watch UART (PS console is FT4232 interface 1):" +echo " uart-monitor status | jq -r '.ports[].label' | grep VERSAL" +echo " uart-monitor tail VERSAL_VMK180_UART1" diff --git a/src/port/amd/boards/versal/jtag/boot.tcl b/src/port/amd/boards/versal/jtag/boot.tcl new file mode 100644 index 00000000..fa6170c7 --- /dev/null +++ b/src/port/amd/boards/versal/jtag/boot.tcl @@ -0,0 +1,72 @@ +# JTAG load of the wolfIP A72-0 bare-metal app on Versal (VMK180). +# +# Versal differs from ZynqMP: there is no psu_init.tcl. Platform bring-up +# (PMC, PSM, NoC, DDR, MIO, clocks) is performed by the PLM, which runs +# from a boot PDI. The board must be in JTAG boot mode (SW1 mode pins = +# 0000) so the BootROM waits and does NOT auto-boot Linux from SD/QSPI; +# otherwise Linux comes up and owns GEM0 (runtime-suspends its clock), +# which stalls our bare-metal driver. +# +# Flow: +# 1. rst -system -- clean POR; in JTAG mode the A72 stays +# held in reset (no Linux). +# 2. device program -- PLM configures DDR/clocks/MIO and +# de-isolates the FPD; this is the Versal +# equivalent of the ZynqMP psu_init step. +# 3. rst -proc A72#0 -- a Versal A72 resets to EL3 (RVBAR), the +# (-skip-activate- exception level startup.S expects. The +# subsystem) -skip-activate-subsystem flag avoids the +# heavy default-subsystem reset that drops +# the board USB-UART. +# 4. 
dow app.elf + con -- DDR is PLM-trained so `dow` is reliable. +# +# Env vars (set by jtag/boot.sh): +# APP_ELF path to app.elf (build with LAYOUT=ddr) +# BOOT_PDI path to a VMK180 boot PDI (PLM + platform config) + +set APP_ELF $env(APP_ELF) +set BOOT_PDI $env(BOOT_PDI) + +puts "Connecting..." +connect + +puts "JTAG chain:" +jtag targets + +# 1. Clean POR. In JTAG boot mode this leaves the A72 held in reset. +puts "rst -system (clean POR; JTAG mode -> no Linux)..." +targets -set -nocase -filter {name =~ "Versal *"} +rst -system +after 3000 + +# 2. Program the boot PDI through the PMC. With more than one device on +# the chain (e.g. a ZCU102 on a second cable) the PMC must be selected +# explicitly or `device program` reports an ambiguous device. +puts "device program (PLM brings up DDR/clocks/MIO): $BOOT_PDI" +targets -set -nocase -filter {name =~ "PMC"} +device program $BOOT_PDI +after 2500 + +# 3. Take over A72 #0 at EL3, before u-boot (if present in the PDI) +# autoboots anything. +puts "Preparing Cortex-A72 #0..." +targets -set -nocase -filter {name =~ "*Cortex-A72*#0"} +rst -proc -skip-activate-subsystem +after 400 +catch {stop} +after 200 +puts "PC after rst -proc (RVBAR, EL3): [rrd pc]" + +# 4. Load and run. +puts "Loading app: $APP_ELF" +dow $APP_ELF +after 200 +puts "PC after dow (app entry): [rrd pc]" + +puts "" +puts "con..." +con + +puts "Detaching, leaving app running." 
+disconnect +exit diff --git a/src/port/amd/boards/versal/target.ld b/src/port/amd/boards/versal/target.ld new file mode 100644 index 00000000..f3ae1b87 --- /dev/null +++ b/src/port/amd/boards/versal/target.ld @@ -0,0 +1,130 @@ +/* Versal Gen 1 (VMK180, Cortex-A72) Linker Script + * + * Memory map (current OCM-only layout): + * OCM : 256 KB @ 0xFFFC0000 (everything lives here) + * DDR low : 2 GB @ 0x00000000 (initialized by the PLM, currently unused + * by this app; reserved for future + * heap or larger ring buffers) + * + * App layout in OCM: + * 0xFFFC0000 .boot_entry, then .vectors (2 KB-aligned) + * ... .text, .rodata, .data, .bss, .page_tables, + * .dma_buffers (linker packs them in order) + * 0x100000000 _stack_top (top of OCM, stack grows down) + * + * Why OCM-only (inherited from the ZynqMP script - TODO re-verify on + * Versal, where the PLM, not psu_init/FSBL/PMU FW, brings up DDR): + * - JTAG iteration: psu_init alone (no PMU FW) doesn't reliably bring + * up DDR for mwr-force loads. OCM is independent of the DDR + * controller and always works. + * - SD boot: bootgen will emit a warning about OCM overlap with FSBL, + * but FSBL's jump-to-image happens after partition load, so the + * overlay is safe. + * - The 16-KB JTAG DAP alias bug at the low DDR window is avoided entirely. + * + * .dma_buffers stays inside OCM (Normal-WB per L2_PERIPH[511]); GEM + * DMA coherency is handled with explicit DC CVAC / IVAC ops in gem_core.c + * rather than via an MMU attribute carve-out. + */ + +OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") +OUTPUT_ARCH(aarch64) +ENTRY(_start) + +/* Single-region layout: everything in OCM. DDR and DMA region + * definitions are kept as placeholders for a future layout that + * spills .dma_buffers (and possibly .bss) into DDR once the JTAG + * iteration path supports it. They are not referenced by any SECTION + * in the current layout.
*/ +MEMORY +{ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 + DDR (rw) : ORIGIN = 0x00010000, LENGTH = 0x001F0000 + DMA (rw) : ORIGIN = 0x00200000, LENGTH = 0x00200000 +} + +/* Stack at top of OCM (we keep stack in OCM with the rest since + * DDR-via-JTAG is unreliable without PMU FW). The 24 KB free area + * above DMA buffers gives plenty of stack room for our app. */ +_stack_top = 0x100000000; + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > OCM :text + + .vectors : + { + . = ALIGN(2048); + KEEP(*(.vectors)) + } > OCM :text + + .text : + { + . = ALIGN(8); + *(.text*) + *(.rodata*) + . = ALIGN(8); + } > OCM :text + + .data : + { + . = ALIGN(8); + _sdata = .; + *(.data*) + . = ALIGN(8); + _edata = .; + } > OCM :text + + /* BSS in OCM as well - DDR-via-JTAG isn't reliable without PMU + * FW, so we keep all writeable state in OCM (256 KB total). */ + .bss (NOLOAD) : + { + . = ALIGN(8); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(8); + _ebss = .; + } > OCM :text + + /* Page tables in OCM so MMU walker isn't dependent on DDR being + * fully up (DDR-via-JTAG is unreliable without PMU FW; CPU + * fetch from OCM is bulletproof). 12 KB total (3x4KB tables). */ + .page_tables (NOLOAD) : + { + . = ALIGN(4096); + _page_tables_start = .; + *(.page_tables) + . = ALIGN(4096); + _page_tables_end = .; + } > OCM :text + + /* DMA buffers also in OCM - OCM is accessible to all AXI masters + * including the GEM DMA. With everything in OCM there's no DDR + * dependency for the basic bring-up. */ + .dma_buffers (NOLOAD) : + { + . = ALIGN(64); + _dma_buffers_start = .; + *(.dma_buffers) + . 
= ALIGN(64); + _dma_buffers_end = .; + } > OCM :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/amd/boards/versal/target_ddr.ld b/src/port/amd/boards/versal/target_ddr.ld new file mode 100644 index 00000000..2ee3824e --- /dev/null +++ b/src/port/amd/boards/versal/target_ddr.ld @@ -0,0 +1,122 @@ +/* Versal Gen 1 (VMK180, Cortex-A72) Linker Script - DDR layout + * + * Used when the app is loaded by wolfBoot (or any loader that places + * the signed image into DDR at a known LOAD_ADDRESS). The PLM and + * BL31 (Versal has no FSBL/PMU FW) have run by the time control reaches + * us, so the DDR controller is fully initialised and the DDR DAP 16-KB + * alias bug is a non-issue (the loader writes via the AXI master path). + * + * Memory map: + * DDR : 0x10000000 .. 0x10FFFFFF (16 MB; matches WOLFBOOT_LOAD_ADDRESS + * in wolfBoot's config/examples/zynqmp.config) + * OCM : 0xFFFC0000 .. 0xFFFFFFFF (256 KB, still mapped Normal-WB + * executable by L2_PERIPH[511]; unused + * for this layout but left in MEMORY + * so MMU page-table addresses inside + * mmu.c remain valid) + * + * App layout in DDR (16 MB region @ 0x10000000): + * .vectors / .text / .rodata / .data / .bss / .page_tables / .dma_buffers + * stack grows down from _stack_top at the top of the region + * + * Stack top is set near the end of the DDR region with plenty of head + * room (4 MB) below for .bss + page tables + DMA buffers. Increase + * the LENGTH below if a larger heap or more DMA buffers are needed. + */ + +OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") +OUTPUT_ARCH(aarch64) +ENTRY(_start) + +MEMORY +{ + DDR (rwx) : ORIGIN = 0x10000000, LENGTH = 0x01000000 /* 16 MB */ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 /* still mapped */ +} + +/* Stack near the top of the DDR region. 16 MB - 4 KB gives the stack + * a safe red zone.
*/ +_stack_top = 0x10FFF000; + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + /* First 4 bytes of the image must be a `b _start` so wolfBoot's + * do_boot() (which branches to LOAD_ADDRESS, not the ELF entry) + * lands on a valid instruction. */ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > DDR :text + + .vectors : + { + . = ALIGN(2048); + KEEP(*(.vectors)) + } > DDR :text + + .text : + { + . = ALIGN(8); + *(.text*) + *(.rodata*) + . = ALIGN(8); + } > DDR :text + + .data : + { + . = ALIGN(8); + _sdata = .; + *(.data*) + . = ALIGN(8); + _edata = .; + } > DDR :text + + .bss (NOLOAD) : + { + . = ALIGN(8); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(8); + _ebss = .; + } > DDR :text + + .page_tables (NOLOAD) : + { + . = ALIGN(4096); + _page_tables_start = .; + *(.page_tables) + . = ALIGN(4096); + _page_tables_end = .; + } > DDR :text + + /* DMA buffers get their own 2 MB-aligned block so the MMU can map + * just this range Normal-NC without making the .text block (which + * the CPU executes) non-cacheable or non-executable. NC is required + * for the GEM rings: the 8-byte BDs share 64-byte cache lines, so a + * cacheable ring with per-BD cache_clean writes stale neighbour BDs + * back over MAC-set OWN bits and wedges RX under sustained (TCP-rate) + * load. See mmu_aarch64.c (Skoll HIGH-2). */ + .dma_buffers (NOLOAD) : + { + . = ALIGN(0x200000); + _dma_buffers_start = .; + *(.dma_buffers) + . 
= ALIGN(64); + _dma_buffers_end = .; + } > DDR :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/amd/boards/zcu102/.gitignore b/src/port/amd/boards/zcu102/.gitignore new file mode 100644 index 00000000..c6cf5083 --- /dev/null +++ b/src/port/amd/boards/zcu102/.gitignore @@ -0,0 +1,5 @@ +*.o +*.elf +*.bin +BOOT.BIN +.layout_stamp diff --git a/src/port/amd/boards/zcu102/Makefile b/src/port/amd/boards/zcu102/Makefile new file mode 100644 index 00000000..9e0953aa --- /dev/null +++ b/src/port/amd/boards/zcu102/Makefile @@ -0,0 +1,145 @@ +# Xilinx ZCU102 (UltraScale+ MPSoC, Cortex-A53) wolfIP bare-metal port +# +# Build: make CROSS_COMPILE=aarch64-none-elf- +# Bootbin: FSBL_ELF=/path/to/fsbl.elf make bootbin +# +# Toolchain: ARM GNU aarch64-none-elf-gcc (tested with 14.3.rel1). + +CROSS_COMPILE ?= aarch64-none-elf- +CC := $(CROSS_COMPILE)gcc +OBJCOPY := $(CROSS_COMPILE)objcopy +SIZE := $(CROSS_COMPILE)size + +ROOT := ../../../../.. +AMD := $(ROOT)/src/port/amd +COMMON := $(AMD)/common +IP := $(AMD)/ip +ARCH := $(AMD)/arch/aarch64 + +# Cortex-A53, AArch64, EL3 single-EL bare-metal. No SIMD/FP in the +# wolfIP/driver paths - keep -mgeneral-regs-only to catch any +# accidental FP use and make the ABI deterministic for cert. +CFLAGS := -mcpu=cortex-a53 -mgeneral-regs-only +CFLAGS += -Os -ffreestanding -fno-builtin -fno-common +CFLAGS += -fdata-sections -ffunction-sections +CFLAGS += -g -Wall -Wextra -Werror -Wno-unused-parameter +CFLAGS += -std=gnu99 +CFLAGS += -I. -I$(COMMON) -I$(ARCH) -I$(IP) -I$(ROOT) -I$(ROOT)/src -I$(ROOT)/src/port +CFLAGS += -DZCU102 -DXILINX_AARCH64 +# Append extra defines for investigation builds, e.g.: +# make CFLAGS_EXTRA="-DDEBUG_GEM -DDEBUG_PHY" +CFLAGS += $(CFLAGS_EXTRA) + +ASFLAGS := -mcpu=cortex-a53 -DUART_EARLY_TX_OFF=0x30 + +# Layout selector. Default ocm keeps the OCM-only layout that the JTAG +# iteration scripts depend on (everything in OCM @ 0xFFFC0000). 
Pass +# LAYOUT=ddr to relink for DDR @ 0x10000000 -- this is the layout +# wolfBoot expects (WOLFBOOT_LOAD_ADDRESS in zynqmp.config). +LAYOUT ?= ocm +ifeq ($(LAYOUT),ddr) + LDSCRIPT := target_ddr.ld + CFLAGS += -DAMD_LAYOUT_DDR +else ifeq ($(LAYOUT),ocm) + LDSCRIPT := target.ld + CFLAGS += -DAMD_LAYOUT_OCM +else + $(error LAYOUT must be 'ocm' or 'ddr') +endif + +LDFLAGS := -nostdlib -nostartfiles -T $(LDSCRIPT) -Wl,-gc-sections +# Replace newlib's aarch64 memset/memcpy (which use 'dc zva' and hang +# on this Cortex-A53 setup) with our bytewise versions in app.c. +LDFLAGS += -Wl,--wrap=memset -Wl,--wrap=memcpy + +LOCAL_C := app.c board.c uart_cadence.c uart_util.c mmu_aarch64.c gic_gicv2.c \ + gem_core.c board_gem.c gem_swq.c gem_rx_swq_poll.c \ + phy_dp83867.c phy_dispatch_dp83867.c entropy.c exception_aarch64.c +LOCAL_S := startup_aarch64.S +LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o) + +# Compile wolfIP core into our directory (don't reuse the upstream .o, +# which may have been built for a different ABI). +WOLFIP_OBJ := wolfip.o +OBJS := $(LOCAL_OBJS) $(WOLFIP_OBJ) + +# Shared sources live outside this board dir; find them by vpath so the +# .o files still land here (keeps clean + JTAG app.elf-in-place working). +vpath %.c $(COMMON):$(ARCH):$(IP) +vpath %.S $(ARCH) + +# Keep 'all' the default goal even though the layout-stamp rules below +# are defined before it. +.DEFAULT_GOAL := all + +# A change in LAYOUT must force a full rebuild. OCM objects link against +# 0xFFFC0000 and a DDR build against 0x10000000; reusing stale objects +# across a layout switch silently produces a wrong image. In particular a +# BOOT.BIN built from OCM-layout objects collides with the FSBL (which +# also lives in OCM) and never hands off. The stamp records the last +# LAYOUT; its mtime only bumps when LAYOUT actually changes, so +# same-layout incremental builds are unaffected.
+LAYOUT_STAMP := .layout_stamp +.PHONY: FORCE +FORCE: +$(LAYOUT_STAMP): FORCE + @if [ "`cat $@ 2>/dev/null`" != "$(LAYOUT)" ]; then \ + echo "LAYOUT -> $(LAYOUT) (was `cat $@ 2>/dev/null`); forcing rebuild"; \ + echo "$(LAYOUT)" > $@; \ + fi +$(OBJS): $(LAYOUT_STAMP) + +all: app.elf + @echo "Built: app.elf" + @$(SIZE) app.elf + +app.elf: $(OBJS) $(LDSCRIPT) + $(CC) $(CFLAGS) $(OBJS) $(LDFLAGS) \ + -Wl,--start-group -lc -lgcc -Wl,--end-group -o $@ + +# wolfIP core: -Wno-zero-length-bounds is needed because wolfIP sizes +# its timer heap as MAX_TIMERS = MAX_TCPSOCKETS * 3. With our minimum +# of MAX_TCPSOCKETS=2 (forced by DHCP/ARP timer scheduling, see +# README), the heap is 6 entries which is fine. The warning fires +# anyway on the zero-length-array code path that wolfIP includes for +# the MAX_TCPSOCKETS=0 profile we'd actually like to use; the [0] +# accesses are runtime-guarded by heap->size > 0 so this is a false +# positive. Drop the suppression once core decouples the timer count +# from MAX_TCPSOCKETS. +$(WOLFIP_OBJ): $(ROOT)/src/wolfip.c + $(CC) $(CFLAGS) -Wno-zero-length-bounds -Wno-type-limits -c $< -o $@ + +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +%.o: %.S + $(CC) $(ASFLAGS) -c $< -o $@ + +# Build a bootable BOOT.BIN. Requires FSBL_ELF env var pointing to a +# pre-built ZCU102 FSBL (built in Vitis or PetaLinux). bootgen itself +# is part of Vitis or available standalone. +# +# BOOT.BIN must use the DDR layout: the OCM layout links at 0xFFFC0000, +# exactly where the FSBL runs from, so an OCM-layout BOOT.BIN clobbers +# the FSBL and never reaches the app. Force a DDR (re)build here +# regardless of any LAYOUT passed on the command line. 
+bootbin: + @if [ -z "$$FSBL_ELF" ]; then \ + echo "ERROR: FSBL_ELF must point to a prebuilt ZCU102 FSBL ELF."; \ + exit 1; \ + fi + $(MAKE) LAYOUT=ddr app.elf + FSBL_ELF=$$FSBL_ELF APP_ELF=$$PWD/app.elf bootgen/build_bootbin.sh + +clean: + rm -f $(OBJS) app.elf BOOT.BIN $(LAYOUT_STAMP) + +.PHONY: all clean bootbin help + +help: + @echo "ZCU102 wolfIP build:" + @echo " make - build app.elf" + @echo " FSBL_ELF=... make bootbin - build BOOT.BIN" + @echo " make clean - remove artifacts" + @echo "" + @echo "Override CROSS_COMPILE if your toolchain prefix differs." diff --git a/src/port/amd/boards/zcu102/README.md b/src/port/amd/boards/zcu102/README.md new file mode 100644 index 00000000..29f0288d --- /dev/null +++ b/src/port/amd/boards/zcu102/README.md @@ -0,0 +1,210 @@ +# wolfIP port: Xilinx ZCU102 (UltraScale+ MPSoC) + +Bare-metal wolfIP port for the AMD/Xilinx Zynq UltraScale+ MPSoC, demoed +on the ZCU102 dev board. Targets a single Cortex-A53 core (APU 0) at +EL3, GCC bare-metal, no Xilinx Standalone BSP, no FreeRTOS, no wolfBoot. + +This first milestone is aimed at a deterministic UDP-only profile +suitable for DO-178C DAL-C qualification. The application opens a +UDP echo socket on port 7 and runs a DHCP client to acquire a lease. + +## What this port covers + +- PS-GEM3 (on-board RJ45) at 1 Gbps via the TI DP83867IR PHY (RGMII). +- Poll-driven RX and TX (`gem_isr()` is called from the main loop via + `gem_rx_swq_poll`, the same model as the Versal/ZC702 ports). The GEM + RX interrupt is left unarmed: an enabled RX-complete interrupt storms + the CPU under sustained TCP-rate RX and wedges the stack. The GICv2 + + `SCR_EL3.IRQ=1` IRQ plumbing in `startup_aarch64.S` / `gic_gicv2.c` + remains in place but dormant. +- Clean-room Cadence GEM driver - no XEmacPs, no Xilinx Standalone BSP, + no `xparameters.h`. All register base addresses live in `board.h`. 
+- MMU at EL3 with a static page table: DDR Normal WB, peripherals + Device-nGnRnE, and an OCM (0xFFFC0000+) Normal-WB executable block + where this app currently lives (text, data, BSS, page tables, and + the GEM BDs/frame buffers all in OCM). GEM DMA coherency is handled + with explicit DC CVAC / IVAC ops in `gem_core.c`. A Normal-NC DMA + carve-out is reserved in the L2_DDR table for a future layout that + spills `.dma_buffers` into DDR but is dormant today. +- PS-UART0 polled console (USB-UART on the ZCU102 board, channel 0). +- DHCP client and a UDP echo demo (port 7); ICMP echo reply works + through the wolfIP core. + +## What is explicitly NOT in this port yet + +- Software VLAN (Daniele has a separate wolfIP-core PR in flight). +- uC/OS-II socket port (planned follow-up; trivially adapts an existing + `bsd_socket.c`). +- Additional GEM instances (GEM0/1/2). Driver is single-instance. +- Versal Gen 1, Zynq-7000. +- wolfBoot integration. Stock Xilinx FSBL hands control directly to + `app.elf`. +- TLS / wolfSSL. + +## Hardware + +- AMD/Xilinx ZCU102 evaluation board (XCZU9EG-2FFVB1156). Rev 1.0 or + 1.1 are both fine. +- USB-UART via the on-board FTDI FT4232 (host sees four `/dev/ttyUSB*` + channels; UART0 is the standard one, typically `/dev/ttyUSB0` or the + channel labelled "MIO" depending on board / udev). +- Ethernet via the on-board RJ45 (PS-GEM3 -> DP83867 PHY @ MDIO 0x0C). + +## Build + +Toolchain: ARM GNU `aarch64-none-elf-gcc`. The default is on `$PATH`; +override with `CROSS_COMPILE=...-` if needed. + +``` +cd src/port/amd/boards/zcu102 +make CROSS_COMPILE=aarch64-none-elf- +``` + +Output: `app.elf`. Section sizes are printed at the end of the build. + +## Build BOOT.BIN + +You need a pre-built ZCU102 FSBL ELF. The simplest way to obtain one +is the Vitis "zynqmp_fsbl" template (single-click build), or PetaLinux +`petalinux-build -c bootloader`. We deliberately do NOT vendor FSBL +sources here; FSBL is a Xilinx-provided component and stock works. 
+ +Source Vitis first (so `bootgen` is on `$PATH`), then: + +``` +FSBL_ELF=/path/to/zynqmp_fsbl.elf make bootbin +``` + +Output: `BOOT.BIN` in the port directory. + +The `bootbin` target always builds the app with the **DDR layout** +(`LAYOUT=ddr`, app at `0x10000000`) regardless of any `LAYOUT=` on the +command line, and forces a rebuild if the previous build used a +different layout. This is deliberate: the OCM layout links the app at +`0xFFFC0000`, which is exactly where the FSBL runs from, so an +OCM-layout `BOOT.BIN` would clobber the FSBL and never reach the app. +The FSBL initialises DDR, loads `app.elf` to `0x10000000`, and hands off +at EL3. + +## Boot + +### SD card boot + +1. Format a microSD as FAT32. +2. Copy `BOOT.BIN` to the root of the SD card. +3. Set ZCU102 boot mode DIP SW6 to SD (positions 1-4 = ON, OFF, OFF, OFF). +4. Insert the card and power-cycle the board. + +### JTAG boot (Vitis xsct) + +``` +xsct +% connect +% targets -set -filter {name =~ "PSU"} +% rst -system +% loadhw -hw /path/to/your-design.xsa +% targets -set -filter {name =~ "Cortex-A53 #0"} +% dow /path/to/wolfip/src/port/amd/boards/zcu102/app.elf +% con +``` + +If you do not have an XSA from your own design, the stock ZCU102 base +design from Vitis is fine - we only depend on the PS configuration +(DDR controller, MIO pinmuxing, IOPLL clocks) which is identical +across base designs. + +### JTAG iteration (no SD swap) + +This port ships a self-contained xsdb loader under `jtag/` that +power-cycles the board (via remote Pi GPIO, optional), forces JTAG +boot mode, runs `psu_init`, loads `app.elf` into OCM, and releases +A53-0 at the OCM entry. The whole app + BSS + page tables + DMA +buffers fit in the 256 KB OCM, so DDR-via-JTAG flakiness is avoided. + +``` +./jtag/boot.sh # one-shot +./jtag/boot_iter.sh # build + power-cycle + load loop +``` + +See `jtag/boot.tcl` for the actual xsdb sequence. 
+ +## Expected UART output + +``` +=== wolfIP ZCU102 (UltraScale+ A53-0 EL3) === +MMU on, caches on. Bringing up GIC-400 (GICv2)... +Initializing wolfIP stack... +Bringing up GEM3 (RGMII, DP83867)... +GEM3: PHY at MDIO addr=0x0000000C +DP83867: ID1=0x00002000 ID2=0x0000A231 +DP83867 link: 1000 Mbps FD + link UP, PHY=0x0000000C +Starting DHCP client... +DHCP bound: + IP: 192.168.1.50 + Mask: 255.255.255.0 + GW: 192.168.1.1 +Opening UDP echo socket on port 7 +Ready. Try: nc -u <board-ip> 7 +``` + +## Verification + +From a host on the same subnet as the board: + +``` +$ ping -c 3 192.168.1.50 +$ echo "hello wolfip" | nc -u -w1 192.168.1.50 7 +hello wolfip +``` + +UART capture via the `uart-monitor` skill (add a board entry pointing +at `/dev/ttyUSB0` and 115200 8N1). + +## Files + +| File | Purpose | +|---------------------|---------| +| `Makefile` | Build app.elf and BOOT.BIN | +| `target.ld` | aarch64 EL3 linker script - separate RX/RW segments, 2 MB DMA region | +| `startup_aarch64.S` | EL3 vectors, BSS clear, MMU/main bring-up, IRQ trampoline | +| `board.h` | PS register base addresses, GIC SPI IDs | +| `mmu_aarch64.c` / `.h` | EL3 page tables (T0SZ=32, 1 GB L1 + 2 MB L2 for DDR + DMA carve-out) | +| `gic_gicv2.c` / `.h` | GIC-400 (GICv2) minimal driver | +| `uart_cadence.c` / `.h` | PS-UART0 polled console | +| `gem_core.c` / `.h` | Cadence GEM driver (PS-GEM3): BDs, polled-RX/TX, MDIO, cache maintenance | +| `phy_dp83867.c` / `.h` | TI DP83867IR init + RGMII skew + AN + RX_CTRL strap quirk | +| `app.c` | wolfIP init, DHCP client, UDP echo on port 7, memset/memcpy wrappers | +| `config.h` | wolfIP build profile (UDP-only intent) | +| `bootgen/boot.bif` | bootgen template (substitutes `${FSBL_ELF}` and `${APP_ELF}`) | +| `bootgen/build_bootbin.sh` | renders the bif and invokes bootgen | +| `jtag/boot.sh` / `.tcl` | xsdb loader for OCM-only JTAG iteration | + +## Notes for cert / DAL-C + +- No Xilinx Standalone BSP linked in.
`aarch64-none-elf-gcc` newlib + provides `memcpy`/`memset` only. +- No dynamic allocation. All buffers static in BSS or `.dma_buffers`. +- No floating point (`-mgeneral-regs-only`). +- The MAC address is hard-coded in `board.h`. Replace with a + per-board value (e.g., read from EEPROM or PS_VERSION fuses) for + production; we keep static for repeatability in the lab. +- The wolfIP core currently sizes its timer heap as + `MAX_TIMERS = MAX_TCPSOCKETS * 3`. This port sets `MAX_TCPSOCKETS=2` + in `config.h` so DHCP / ARP can schedule timers; the application + does not open any TCP sockets. A core wolfIP follow-up should + decouple the timer count from TCP so the TCP code can be fully + excluded from a DAL-C build. +- The wolfIP core triggers two false-positive GCC warnings + (`-Wzero-length-bounds`, `-Wtype-limits`) when `MAX_TCPSOCKETS` + reaches its lower bound. We suppress them on the wolfip.c compile + only; the diagnostics on this port's source remain at `-Wall -Wextra + -Werror`. +- newlib's aarch64 `memset`/`memcpy` use `dc zva`, which hangs on this + Cortex-A53 setup even with `SCTLR_EL3.DZE=1`. We override both with + bytewise versions in `app.c` via `-Wl,--wrap`. + +## Known issues + +- `MAX_TCPSOCKETS=2` is the minimum for the current wolfIP core - see + the timer-heap note above. diff --git a/src/port/amd/boards/zcu102/board.c b/src/port/amd/boards/zcu102/board.c new file mode 100644 index 00000000..b1f50b16 --- /dev/null +++ b/src/port/amd/boards/zcu102/board.c @@ -0,0 +1,26 @@ +/* board.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * ZCU102 board hooks for the shared demo (app.c). + */ +#include "app.h" +#include "gic.h" + +const char *board_banner(void) +{ + return "\n\n=== wolfIP ZCU102 (UltraScale+ A53-0 EL3) ===\n" + "MMU on, caches on. 
Bringing up GIC-400 (GICv2)...\n"; +} + +void board_irq_setup(void) +{ + /* RX is poll-driven (gem_rx_swq_poll: gem_isr() is called from the main + * loop and gem_rx_install() is a no-op, so no GEM SPI is armed). The + * IRQ-driven RX model stormed the CPU under sustained TCP-rate RX and + * wedged the stack, so this board now uses the same poll model as the + * other two. Unmasking CPU IRQs here is harmless (no source enabled). */ + irq_enable(); +} diff --git a/src/port/amd/boards/zcu102/board.h b/src/port/amd/boards/zcu102/board.h new file mode 100644 index 00000000..3daafcbf --- /dev/null +++ b/src/port/amd/boards/zcu102/board.h @@ -0,0 +1,115 @@ +/* board.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Xilinx UltraScale+ MPSoC PS register base addresses, GIC SPI IDs, + * and clock parents for the ZCU102 board. All values are derived from + * the ZynqMP TRM (UG1085) and the ZCU102 board user guide (UG1182). + * No Xilinx BSP header (xparameters.h) is required. 
+ */ +#ifndef ZCU102_BOARD_H +#define ZCU102_BOARD_H + +#include + +/* --------------------------------------------------------------------- + * Memory map (ZynqMP PS) + * ------------------------------------------------------------------- */ +#define DDR_BASE 0x00000000UL +#define DDR_SIZE 0x80000000UL /* 2 GB lower bank */ + +#define OCM_BASE 0xFFFC0000UL +#define OCM_SIZE 0x00040000UL /* 256 KB */ + +/* --------------------------------------------------------------------- + * PS peripherals + * ------------------------------------------------------------------- */ +#define UART0_BASE 0xFF000000UL +#define UART1_BASE 0xFF010000UL + +/* Console UART: ZCU102 routes UART0 to the on-board FTDI. The FSBL ref + * clock is the known 100 MHz (IOPLL/15), so we program the divisors. */ +#define UART_BASE UART0_BASE +#define UART_PROGRAM_BAUD +#define UART_BAUDGEN_CD 124 +#define UART_BAUDDIV_BDIV 6 + +#define GEM0_BASE 0xFF0B0000UL +#define GEM1_BASE 0xFF0C0000UL +#define GEM2_BASE 0xFF0D0000UL +#define GEM3_BASE 0xFF0E0000UL + +/* On-board RJ45 is GEM3 on the ZCU102. */ +#define GEM_BASE GEM3_BASE +#define IRQ_GEM IRQ_GEM3 + +#define CRL_APB_BASE 0xFF5E0000UL +#define IOU_SLCR_BASE 0xFF180000UL + +/* GIC-400 distributor and CPU interface (per ZynqMP TRM). */ +#define GICD_BASE 0xF9010000UL +#define GICC_BASE 0xF9020000UL + +/* --------------------------------------------------------------------- + * GIC SPI numbers as GIC INTIDs (ARM GIC numbering: SPI N -> INTID 32+N). + * The ZynqMP TRM Table 13-1 column "SPI ID" is the GIC_SPI offset (0..) + * used in Linux device trees; the actual GIC INTID is 32 + that offset. + * We use INTIDs directly throughout this driver, so add 32. 
+ * ------------------------------------------------------------------- */ +#define IRQ_GEM0 (32 + 57) /* GIC_SPI 57 -> INTID 89 */ +#define IRQ_GEM1 (32 + 59) /* GIC_SPI 59 -> INTID 91 */ +#define IRQ_GEM2 (32 + 61) /* GIC_SPI 61 -> INTID 93 */ +#define IRQ_GEM3 (32 + 63) /* GIC_SPI 63 -> INTID 95 + * on-board ZCU102 RJ45 */ + +/* --------------------------------------------------------------------- + * CRL_APB clock and reset registers + * ------------------------------------------------------------------- */ +#define CRL_APB_GEM3_REF_CTRL (CRL_APB_BASE + 0x5C) +#define CRL_APB_RST_LPD_IOU0 (CRL_APB_BASE + 0x230) /* GEM3 reset bit 3 */ + +/* --------------------------------------------------------------------- + * PS UART0 (Cadence) - on-board USB-UART on ZCU102 via U104 FT4232 + * ------------------------------------------------------------------- */ +#define UART_BAUD 115200 + +/* MAC address for eth0. Locally-administered, even first octet: + * 02:00:5A:11:22:33. Each byte is individually overridable via + * build-time -DWOLFIP_MAC_n=0xXX so callers can swap any subset + * (e.g. only the last three bytes from an EEPROM-derived value). */ +#ifndef WOLFIP_MAC_0 +#define WOLFIP_MAC_0 0x02 +#endif +#ifndef WOLFIP_MAC_1 +#define WOLFIP_MAC_1 0x00 +#endif +#ifndef WOLFIP_MAC_2 +#define WOLFIP_MAC_2 0x5A +#endif +#ifndef WOLFIP_MAC_3 +#define WOLFIP_MAC_3 0x11 +#endif +#ifndef WOLFIP_MAC_4 +#define WOLFIP_MAC_4 0x22 +#endif +#ifndef WOLFIP_MAC_5 +#define WOLFIP_MAC_5 0x33 +#endif + +#endif /* ZCU102_BOARD_H */ diff --git a/src/port/amd/boards/zcu102/board_gem.c b/src/port/amd/boards/zcu102/board_gem.c new file mode 100644 index 00000000..13f18d08 --- /dev/null +++ b/src/port/amd/boards/zcu102/board_gem.c @@ -0,0 +1,61 @@ +/* board_gem.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * ZCU102 (ZynqMP) GEM clock/reset hooks for the shared GEM core. 
The GEM3 + * reference clock and reset live in CRL_APB, which bare-metal may poke on + * ZynqMP. + */ +#include +#include "board.h" +#include "gem_port.h" +#include "timer.h" /* delay_us / delay_ms - deterministic, counter-backed */ + +#define CRL_RST_GEM3 (1u << 3) /* GEM3 reset bit in RST_LPD_IOU0 */ + +void gem_soc_pre_init(void) +{ + /* No SoC quirk needed before MAC config on ZynqMP. */ +} + +/* Configure CRL_APB.GEM3_REF_CTRL for the negotiated link speed. The MAC + * sources TX_CLK to the PHY at this rate (RGMII): 125/25/2.5 MHz for + * 1G/100M/10M. IOPLL = 1500 MHz, /12 base. Register layout (TRM): + * CLKACT bit26, CLKACT_RX bit25, DIVISOR1 [21:16], DIVISOR0 [13:8], + * SRCSEL [2:0]. */ +void gem_set_ref_clk(int speed_mbps) +{ + volatile uint32_t *gem3_ref = (volatile uint32_t *)CRL_APB_GEM3_REF_CTRL; + uint32_t div1; + uint32_t val; + + switch (speed_mbps) { + case 1000: div1 = 1; break; + case 100: div1 = 5; break; + case 10: div1 = 50; break; + default: div1 = 1; break; + } + val = (1u << 26) /* CLKACT */ + | (1u << 25) /* CLKACT_RX */ + | ((div1 & 0x3Fu) << 16) /* DIVISOR1 */ + | ((12u & 0x3Fu) << 8) /* DIVISOR0 */ + | (0u); /* SRCSEL = IOPLL */ + *gem3_ref = val; +} + +/* Pulse the GEM3 reset bit so the MAC starts from a known state, then + * force the 125 MHz reference (amd_eth_init downshifts later if the PHY + * negotiates 100/10). */ +void gem_clk_reset(void) +{ + volatile uint32_t *rst = (volatile uint32_t *)CRL_APB_RST_LPD_IOU0; + + *rst |= CRL_RST_GEM3; + delay_us(10); /* hold the reset asserted */ + *rst &= ~CRL_RST_GEM3; + delay_ms(10); /* settle after deassert (counter-backed) */ + + gem_set_ref_clk(1000); +} diff --git a/src/port/amd/boards/zcu102/bootgen/boot.bif b/src/port/amd/boards/zcu102/bootgen/boot.bif new file mode 100644 index 00000000..f3370894 --- /dev/null +++ b/src/port/amd/boards/zcu102/bootgen/boot.bif @@ -0,0 +1,14 @@ +// ZCU102 BOOT.BIN definition for wolfIP bare-metal app. 
+// +// Variables expanded by build_bootbin.sh: +// ${FSBL_ELF} - path to a pre-built ZynqMP FSBL (A53-0, EL3, NS) +// ${APP_ELF} - path to the wolfIP app ELF (this directory's app.elf) +// +// bootgen consumes this file with: +// bootgen -arch zynqmp -image boot.bif -w on -o BOOT.BIN + +the_ROM_image: +{ + [bootloader, destination_cpu=a53-0] ${FSBL_ELF} + [destination_cpu=a53-0, exception_level=el-3] ${APP_ELF} +} diff --git a/src/port/amd/boards/zcu102/bootgen/build_bootbin.sh b/src/port/amd/boards/zcu102/bootgen/build_bootbin.sh new file mode 100755 index 00000000..0069760f --- /dev/null +++ b/src/port/amd/boards/zcu102/bootgen/build_bootbin.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# +# Build BOOT.BIN for the wolfIP ZCU102 bare-metal app. +# +# Required env vars: +# FSBL_ELF - path to a prebuilt ZynqMP FSBL ELF (A53-0, EL3, NS). +# Build this once in Vitis (helloworld template -> zynqmp_fsbl) +# or in PetaLinux; we do not vendor FSBL sources here. +# APP_ELF - path to the wolfIP app ELF. The Makefile's "bootbin" +# target sets this for you to $PWD/app.elf. +# +# Optional: +# BOOTGEN - path to the bootgen binary (default: from $PATH). +# OUT_DIR - where to place BOOT.BIN (default: parent of this script). +# +set -euo pipefail + +if [[ -z "${FSBL_ELF:-}" ]]; then + echo "ERROR: FSBL_ELF env var must point to a ZynqMP FSBL ELF." >&2 + exit 1 +fi +if [[ -z "${APP_ELF:-}" ]]; then + echo "ERROR: APP_ELF env var must point to the wolfIP app ELF." >&2 + exit 1 +fi +if [[ ! -f "${FSBL_ELF}" ]]; then + echo "ERROR: FSBL_ELF '${FSBL_ELF}' not found." >&2 + exit 1 +fi +if [[ ! -f "${APP_ELF}" ]]; then + echo "ERROR: APP_ELF '${APP_ELF}' not found." >&2 + exit 1 +fi + +BOOTGEN="${BOOTGEN:-bootgen}" +if ! command -v "${BOOTGEN}" >/dev/null 2>&1; then + echo "ERROR: bootgen not found. Source Vitis (settings64.sh) first." 
>&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUT_DIR="${OUT_DIR:-$(dirname "${SCRIPT_DIR}")}" +BIF_TEMPLATE="${SCRIPT_DIR}/boot.bif" +BIF_RENDERED="$(mktemp -t wolfip-zcu102-bif.XXXXXX)" +trap 'rm -f "${BIF_RENDERED}"' EXIT + +# Substitute ${FSBL_ELF} and ${APP_ELF} in the bif template. +sed \ + -e "s|\${FSBL_ELF}|${FSBL_ELF}|g" \ + -e "s|\${APP_ELF}|${APP_ELF}|g" \ + "${BIF_TEMPLATE}" > "${BIF_RENDERED}" + +cd "${OUT_DIR}" +"${BOOTGEN}" -arch zynqmp -image "${BIF_RENDERED}" -w on -o BOOT.BIN + +echo "BOOT.BIN written to: ${OUT_DIR}/BOOT.BIN" diff --git a/src/port/amd/boards/zcu102/config.h b/src/port/amd/boards/zcu102/config.h new file mode 100644 index 00000000..f53755de --- /dev/null +++ b/src/port/amd/boards/zcu102/config.h @@ -0,0 +1,31 @@ +/* config.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * wolfIP configuration for Xilinx ZCU102 (UltraScale+ MPSoC, A53-0 EL3 + * bare-metal). Shared AMD/Xilinx profile lives in common/wolfip_config.h. + */ +#ifndef WOLF_CONFIG_H +#define WOLF_CONFIG_H + +/* Per-board overrides (if any) go here, before the shared profile. 
*/ + +#include "wolfip_config.h" + +#endif /* WOLF_CONFIG_H */ diff --git a/src/port/amd/boards/zcu102/flash_sd.sh b/src/port/amd/boards/zcu102/flash_sd.sh new file mode 100755 index 00000000..44f88f1b --- /dev/null +++ b/src/port/amd/boards/zcu102/flash_sd.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# +# flash_sd.sh - copy wolfIP ZCU102 BOOT.BIN to the SD card's boot partition. +# +# Usage: +# ./flash_sd.sh # uses /dev/sdb (default) and BOOT.BIN next to this script +# SD_DEV=/dev/sdc ./flash_sd.sh +# BOOTBIN=/path/to/BOOT.BIN ./flash_sd.sh +# +# Defensive: refuses to write to a device that is not flagged removable +# by the kernel, or any device larger than 128 GiB (so it cannot ever +# scribble on your system SSD by accident). +# +set -euo pipefail + +SD_DEV="${SD_DEV:-/dev/sdb}" +# Device names ending in a digit (e.g. mmcblk0, nvme0n1) take a 'p' before +# the partition number (mmcblk0p1); sdX style names just append it (sdb1). +case "${SD_DEV}" in + *[0-9]) PART="${SD_DEV}p1" ;; + *) PART="${SD_DEV}1" ;; +esac +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BOOTBIN="${BOOTBIN:-${SCRIPT_DIR}/BOOT.BIN}" + +red() { printf '\033[1;31m%s\033[0m\n' "$*" >&2; } +green() { printf '\033[1;32m%s\033[0m\n' "$*"; } +note() { printf ' %s\n' "$*"; } + +# --- Sanity checks ------------------------------------------------------- + +if [[ ! -b "${SD_DEV}" ]]; then + red "ERROR: ${SD_DEV} is not a block device." + exit 1 +fi +if [[ ! -b "${PART}" ]]; then + red "ERROR: boot partition ${PART} not found." + red " Did you insert the card and pick the right SD_DEV?" + exit 1 +fi + +RM=$(lsblk -dn -o RM "${SD_DEV}" | tr -d '[:space:]') +if [[ "${RM}" != "1" ]]; then + red "ERROR: ${SD_DEV} is not marked removable (RM=${RM})." + red " Refusing to write - this looks like a fixed disk."
+ exit 1 +fi + +SIZE_BYTES=$(lsblk -dn -o SIZE -b "${SD_DEV}" | tr -d '[:space:]') +SIZE_GIB=$(( SIZE_BYTES / 1024 / 1024 / 1024 )) +if (( SIZE_GIB > 128 )); then + red "ERROR: ${SD_DEV} is ${SIZE_GIB} GiB - too large for an SD card." + red " Refusing to write." + exit 1 +fi + +if [[ ! -f "${BOOTBIN}" ]]; then + red "ERROR: ${BOOTBIN} not found. Did you run 'make bootbin'?" + exit 1 +fi + +note "SD device : ${SD_DEV} (${SIZE_GIB} GiB, removable)" +note "Boot partition: ${PART}" +note "Source : ${BOOTBIN}" +echo + +# --- Mount (idempotent) -------------------------------------------------- + +MNT=$(lsblk -no MOUNTPOINT "${PART}") +WE_MOUNTED=0 +if [[ -z "${MNT}" ]]; then + note "Mounting ${PART} via udisksctl..." + udisksctl mount -b "${PART}" >/dev/null + MNT=$(lsblk -no MOUNTPOINT "${PART}") + WE_MOUNTED=1 +fi +if [[ -z "${MNT}" ]]; then + red "ERROR: ${PART} did not mount." + exit 1 +fi +note "Mountpoint : ${MNT}" + +# Verify FAT - cheap heuristic: check filesystem type via lsblk. +FSTYPE=$(lsblk -no FSTYPE "${PART}") +if [[ "${FSTYPE}" != "vfat" && "${FSTYPE}" != "exfat" && "${FSTYPE}" != "msdos" ]]; then + red "WARN: ${PART} filesystem is '${FSTYPE}', expected vfat for ZCU102 SD boot." +fi + +# --- Backup and copy ----------------------------------------------------- + +if [[ -f "${MNT}/BOOT.BIN" ]]; then + OLD_SZ=$(stat -c%s "${MNT}/BOOT.BIN") + cp --preserve=timestamps "${MNT}/BOOT.BIN" "${MNT}/BOOT.BIN.bak" + note "Backed up existing BOOT.BIN (${OLD_SZ} bytes) -> BOOT.BIN.bak" +fi + +cp "${BOOTBIN}" "${MNT}/BOOT.BIN" +sync +NEW_SZ=$(stat -c%s "${MNT}/BOOT.BIN") +note "Wrote ${NEW_SZ} bytes to ${MNT}/BOOT.BIN" + +# --- Unmount ------------------------------------------------------------- + +if (( WE_MOUNTED == 1 )); then + note "Unmounting ${PART}..." + udisksctl unmount -b "${PART}" >/dev/null +fi +sync + +green "Done. Safe to remove the SD card and boot the board." 
+echo +note "Watch UART log: tail -f /tmp/uart-monitor/latest/ZYNQMP_ZCU102_UART0.log" +note "Or: uart-monitor tail ZYNQMP_ZCU102_UART0" diff --git a/src/port/amd/boards/zcu102/jtag/boot.sh b/src/port/amd/boards/zcu102/jtag/boot.sh new file mode 100755 index 00000000..fa05e6a8 --- /dev/null +++ b/src/port/amd/boards/zcu102/jtag/boot.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +# +# Boot the wolfIP ZCU102 app via JTAG (Platform Cable II / Digilent). +# Run it from any directory; assumes a hw_server reachable on localhost +# (the default when Vitis is local). +# +# Required env (no built-in defaults; set per-developer): +# XSDB - path to Vitis xsdb binary +# (e.g. /opt/Xilinx/2025.2/Vitis/bin/xsdb) +# PSU_INIT_TCL - path to a ZCU102 psu_init.tcl, generated by Vitis +# for a base design that matches your board (DDR, +# MIO pinmux, IOPLL clocks). The PetaLinux hw- +# description directory contains one. +# OBJCOPY - aarch64 objcopy binary, e.g. +# aarch64-none-elf-objcopy on PATH +# +# Optional env (sensible defaults): +# APP_ELF - default: ${PORT_DIR}/app.elf +# APP_BIN - default: ${PORT_DIR}/app.bin (objcopy'd here) +# APP_LOAD_ADDR- default: 0xFFFC0000 (OCM). Set to 0x10000000 for the +# LAYOUT=ddr build, which is also what wolfBoot uses. +# PMUFW_ELF - path to pmufw.elf. When set, jtag/boot.tcl loads it +# into the PMU MicroBlaze and starts it before +# psu_init. Required for reliable DDR access via +# JTAG; not needed for the OCM-only layout.
+# +# Usage (from the port directory): +# XSDB=/opt/Xilinx/2025.2/Vitis/bin/xsdb \ +# PSU_INIT_TCL=/path/to/psu_init.tcl \ +# OBJCOPY=aarch64-none-elf-objcopy \ +# ./jtag/boot.sh +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PORT_DIR="$(dirname "${SCRIPT_DIR}")" + +: "${XSDB:?XSDB is required (path to Vitis xsdb binary)}" +: "${PSU_INIT_TCL:?PSU_INIT_TCL is required (path to a ZCU102 psu_init.tcl)}" +: "${OBJCOPY:?OBJCOPY is required (aarch64 objcopy binary on PATH or absolute path)}" +APP_ELF="${APP_ELF:-${PORT_DIR}/app.elf}" +APP_BIN="${APP_BIN:-${PORT_DIR}/app.bin}" + +if ! command -v "${XSDB}" >/dev/null 2>&1 && [[ ! -x "${XSDB}" ]]; then + echo "ERROR: xsdb not found / not executable: ${XSDB}" >&2 + exit 1 +fi +if [[ ! -f "${PSU_INIT_TCL}" ]]; then + echo "ERROR: psu_init.tcl not found at ${PSU_INIT_TCL}" >&2 + exit 1 +fi +if ! command -v "${OBJCOPY}" >/dev/null 2>&1 && [[ ! -x "${OBJCOPY}" ]]; then + echo "ERROR: objcopy not found / not executable: ${OBJCOPY}" >&2 + exit 1 +fi +if [[ ! -f "${APP_ELF}" ]]; then + echo "ERROR: app.elf not found at ${APP_ELF}. Run 'make' first." >&2 + exit 1 +fi + +# Generate flat binary (PT_LOAD segments concatenated by physical addr). +echo "Generating app.bin from app.elf..." +"${OBJCOPY}" -O binary "${APP_ELF}" "${APP_BIN}" + +# If PMU FW is provided, generate its flat binary alongside. xsdb's +# `dow` fails on the PMU MicroBlaze target without a loaded XSA, so +# the TCL loads it via mwr-force from this .bin instead. PMU FW is +# MicroBlaze, so it needs a MicroBlaze-capable objcopy (override +# with MB_OBJCOPY=, defaults to the Vitis-shipped one if present). +if [[ -n "${PMUFW_ELF:-}" ]]; then + if [[ ! -f "${PMUFW_ELF}" ]]; then + echo "ERROR: PMUFW_ELF set but not found: ${PMUFW_ELF}" >&2 + exit 1 + fi + MB_OBJCOPY="${MB_OBJCOPY:-/opt/Xilinx/2025.2/gnu/microblaze/lin/bin/mb-objcopy}" + if ! command -v "${MB_OBJCOPY}" >/dev/null 2>&1 && [[ ! 
-x "${MB_OBJCOPY}" ]]; then + echo "ERROR: MicroBlaze objcopy not found at ${MB_OBJCOPY}" >&2 + echo " set MB_OBJCOPY=/path/to/mb-objcopy" >&2 + exit 1 + fi + PMUFW_BIN="${PMUFW_BIN:-${PMUFW_ELF%.elf}.bin}" + echo "Generating $(basename "${PMUFW_BIN}") from pmufw.elf (mb-objcopy)..." + "${MB_OBJCOPY}" -O binary "${PMUFW_ELF}" "${PMUFW_BIN}" + export PMUFW_BIN +fi + +echo "JTAG boot ZCU102 wolfIP app" +echo " xsdb : ${XSDB}" +echo " psu_init.tcl : ${PSU_INIT_TCL}" +echo " app.elf : ${APP_ELF}" +echo " app.bin : ${APP_BIN} ($(stat -c%s "${APP_BIN}") bytes)" +echo + +export APP_ELF APP_BIN PSU_INIT_TCL APP_LOAD_ADDR PMUFW_ELF + +"${XSDB}" "${SCRIPT_DIR}/boot.tcl" + +echo +echo "App is running. Watch UART:" +echo " uart-monitor tail ZYNQMP_ZCU102_UART0" diff --git a/src/port/amd/boards/zcu102/jtag/boot.tcl b/src/port/amd/boards/zcu102/jtag/boot.tcl new file mode 100644 index 00000000..eadb70fa --- /dev/null +++ b/src/port/amd/boards/zcu102/jtag/boot.tcl @@ -0,0 +1,225 @@ +# JTAG load of the wolfIP A53-0 bare-metal app on ZCU102. +# +# Pattern adapted from a known-working ZynqMP JTAG bare-metal loader +# (puf-provision/run.tcl). Key differences from earlier attempts that +# all failed silently: +# 1. Force JTAG bootmode via CRL_APB.BOOT_MODE_USER (mwr 0xFF5E0200 0x0100). +# Without this, rst -system leaves the SoC in a state where dow +# eventually fails or the core won't resume. +# 2. Use psu_init.tcl directly (no FSBL stage). FSBL on this board has +# a JTAG-mode park (WFE deep-sleep) that 'con' cannot wake. +# 3. Use mwr -force per word to write the raw binary instead of dow. +# xsdb's dow path on DDR has a cache-flush dance that fails after +# psu_init runs. +# 4. Install a 'b .' bootloop at the A53 default RVBAR (0xFFFF0000) +# so rst -processor is safe and doesn't fly off into garbage. +# 5. After the load, target A53, rst -processor, stop, rwr pc, con.
+# +# Env vars (set by jtag/boot.sh): +# APP_BIN path to the raw binary (objcopy -O binary app.elf app.bin) +# APP_ELF path to the ELF (for entry point reading) +# PSU_INIT_TCL path to psu_init.tcl + +set OCM_BASE 0xFFFC0000 +# DDR layout uses 0x10000000 (matches WOLFBOOT_LOAD_ADDRESS in +# wolfBoot's config/examples/zynqmp.config). The jtag/boot.sh script +# exports APP_LOAD_ADDR if set, otherwise defaults to the OCM base. +# Use scan to convert a hex string ("0x10000000") to an integer the +# rest of this script can compare and pass to mwr / dow. +if {[info exists env(APP_LOAD_ADDR)]} { + scan $env(APP_LOAD_ADDR) "%i" APP_LOAD_ADDR +} else { + set APP_LOAD_ADDR $OCM_BASE +} + +# Load a raw binary file to a target address via mwr -force, one +# 32-bit word at a time. Slow but reliable - bypasses xsdb's cache +# coherency logic that breaks dow on DDR after psu_init. +proc load_binary {bin_file base_addr} { + set fp [open $bin_file rb] + set data [read $fp] + close $fp + set len [string length $data] + + # Pad to 4-byte alignment. + set pad [expr {(4 - ($len % 4)) % 4}] + if {$pad > 0} { + append data [string repeat "\x00" $pad] + } + set padded [string length $data] + set words [expr {$padded / 4}] + + puts " loading [format %d $len] bytes ($words words) to [format 0x%08X $base_addr]" + + targets -set -nocase -filter {name =~ "*PSU*"} + for {set i 0} {$i < $words} {incr i} { + set off [expr {$i * 4}] + binary scan $data @${off}iu word + mwr -force [format "0x%X" [expr {$base_addr + $off}]] \ + [format "0x%X" [expr {$word & 0xFFFFFFFF}]] + if {($i % 8192) == 0 && $i > 0} { + puts " [expr {$i * 100 / $words}]%..." + } + } + puts " 100% done" + return $len +} + +# ---------------------------------------------------------------------- +# 1. Connect, system reset, force JTAG bootmode. +# ---------------------------------------------------------------------- +puts "Connecting..." +connect + +# Enumerate the JTAG chain explicitly. 
Without this poke, the DAP / +# PSU / APU targets are sometimes not visible immediately after the +# hw_server attach - 'targets' will only show PS TAP / PMU / PL. +puts "JTAG chain:" +jtag targets + +puts "All targets:" +targets + +puts "System reset..." +# On a cold board in JTAG boot mode the PSU/APU node is not enumerated +# until the PMU has been reset -- only PS TAP / PMU / PL are visible at +# connect. Select the PMU for the system reset; the PSU node (with the +# A53 cores) appears afterwards for the steps below. Filtering for +# "*PSU*" before this reset fails on a freshly powered board. +targets -set -nocase -filter {name =~ "PMU"} +rst -system +after 1500 + +# ---------------------------------------------------------------------- +# 1b. Load and start PMU firmware (MicroBlaze on the PMU). +# +# Without PMU FW, JTAG writes to DDR after psu_init are unreliable on +# this board -- the DDR controller training appears to need PMU +# coordination. Loading PMU FW via JTAG mirrors what the CSU +# BootROM would do during a normal SD/QSPI boot. Only do this if +# PMUFW_ELF is set in the environment; otherwise we keep the OCM-only +# behavior we had in Phase 1. We do this BEFORE the CSU JTAG-bootmode +# write because CSU touches PMU on the bootmode handshake. +# ---------------------------------------------------------------------- +if {[info exists env(PMUFW_BIN)]} { + puts "" + puts "Loading PMU FW: $env(PMUFW_BIN)" + # xsdb's `dow` fails on PMU MicroBlaze without a loaded XSA + # ("Invalid context"). Bypass it by writing the binary via + # mwr-force to PMU IRAM at 0xFFDC0000 -- same technique we use + # for the A53 app. The PMU's BootROM hands control to IRAM @ + # 0xFFDC0000 after we deassert PMU reset (psu_init touches PMU + # via CRL_APB.RST_LPD_TOP which keeps PMU running). + jtag targets + targets -set -nocase -filter {name =~ "PMU"} + stop + after 200 + load_binary $env(PMUFW_BIN) 0xFFDC0000 + con + after 1500 + puts "PMU FW running." 
+} + +puts "Forcing JTAG boot mode (CSU)..." +targets -set -nocase -filter {name =~ "*PSU*"} +mwr 0xFF5E0200 0x0100 +after 1000 + +# ---------------------------------------------------------------------- +# 2. psu_init - DDR, clocks, MIO, UART, GEM3 pinmux. +# ---------------------------------------------------------------------- +puts "Sourcing psu_init.tcl..." +source $env(PSU_INIT_TCL) +puts "psu_init..." +psu_init +after 1000 +puts "psu_post_config..." +psu_post_config +after 500 + +# ---------------------------------------------------------------------- +# 3. UART0 baud init (FSBL would do this; psu_init alone doesn't). +# ---------------------------------------------------------------------- +puts "UART0 baud init (115200 8N1 at 100 MHz ref)..." +targets -set -nocase -filter {name =~ "*PSU*"} +mwr 0xFF000000 0x03 ;# CR: TX_RST + RX_RST +mwr 0xFF000004 0x20 ;# MR: 8N1 +mwr 0xFF000018 124 ;# BAUDGEN: CD = 124 +mwr 0xFF000034 6 ;# BAUDDIV: BDIV = 6 +mwr 0xFF000000 0x114 ;# CR: TXEN + RXEN + STPBRK +after 100 + +# Banner write so we can see UART is live before our app starts. +foreach c [split "=== JTAG ready, loading app ===\r\n" ""] { + scan $c %c v + mwr -force 0xFF000030 $v +} +after 200 + +# ---------------------------------------------------------------------- +# 4. Load the wolfIP app. +# +# Both layouts are loaded word-by-word with mwr -force (load_binary); +# xsdb's native `dow` is not used anywhere in this script. The load +# below is unconditional - there is no `dow` fast path for DDR. +# APP_LOAD_ADDR only selects the destination: the OCM base by default, +# or a DDR address (e.g. 0x10000000 for the LAYOUT=ddr build). When the +# address is below 0xFF000000 (i.e. not OCM/peripheral space) the first +# word is additionally read back and compared after the load; a +# mismatch is reported but does not abort the boot.
+# ---------------------------------------------------------------------- +puts "" +puts "Loading: $env(APP_BIN) at [format 0x%X $APP_LOAD_ADDR] via mwr-force" +load_binary $env(APP_BIN) $APP_LOAD_ADDR +# Verify the first word landed. KNOWN ISSUE: with APP_LOAD_ADDR in DDR +# (e.g. 0x10000000), single-word mwr-force writes succeed but the +# bulk per-word loop in load_binary frequently shows the first word +# read back as something other than what we wrote, even with PMU FW +# running. The same xsdb cache/coherency dance that breaks `dow` over +# DDR after psu_init appears to be at play. The OCM target works +# reliably. Track this separately; the DDR path will be exercised +# end-to-end via SD/QSPI once wolfBoot's bootgen chain is set up. +if {$APP_LOAD_ADDR < 0xFF000000} { + set fp [open $env(APP_BIN) rb] + set head [read $fp 4] + close $fp + binary scan $head iu expect + set got [mrd -value -force [format 0x%X $APP_LOAD_ADDR]] + puts [format " verify: image\[0\]=0x%08X mem\[0\]=0x%08X %s" \ + $expect $got [expr {$expect == $got ? "OK" : "MISMATCH (known JTAG-DDR issue)"}]] +} + +# ---------------------------------------------------------------------- +# 5. Install RVBAR boot loop in OCM so rst -processor doesn't crash. +# ---------------------------------------------------------------------- +puts "" +puts "Installing RVBAR boot loop at 0xFFFF0000..." +targets -set -nocase -filter {name =~ "*PSU*"} +mwr -force 0xFFFF0000 0x14000000 ;# B . (branch to self, aarch64) +mwr -force 0xFFFF0004 0x14000000 + +# ---------------------------------------------------------------------- +# 6. A53 #0: reset, halt, set PC, continue. +# ---------------------------------------------------------------------- +puts "" +puts "Preparing A53 #0..." +targets -set -nocase -filter {name =~ "*A53*#0"} +rst -processor +after 200 +catch {stop} +after 200 +puts "PC after rst -processor (should be RVBAR 0xFFFF0000): [rrd pc]" + +set readelf [expr {[info exists env(READELF)] ? 
$env(READELF) : "aarch64-none-elf-readelf"}] +set entry [exec $readelf -h $env(APP_ELF) | grep "Entry point" | awk "{print \$NF}"] +puts "App ELF entry: $entry" +rwr pc $entry +puts "PC after rwr: [rrd pc]" + +puts "" +puts "con..." +con + +puts "Detaching, leaving app running." +disconnect +exit diff --git a/src/port/amd/boards/zcu102/jtag/boot_iter.sh b/src/port/amd/boards/zcu102/jtag/boot_iter.sh new file mode 100755 index 00000000..68d73194 --- /dev/null +++ b/src/port/amd/boards/zcu102/jtag/boot_iter.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# +# JTAG iteration helper: power-cycles the ZCU102, restarts hw_server, +# clears the UART log, JTAG-loads the app, and dumps the resulting +# UART output. Useful for headless iteration without physical access +# to the board. +# +# Everything that touches your specific bench is parameterised through +# env vars. Required ones have no default: the script aborts if unset. +# +# Required env (in addition to whatever boot.sh requires): +# POWER_OFF_CMD - shell command to power the board OFF (e.g. +# "ssh pi@Pi4 'raspi-gpio set 20 op dl'") +# POWER_ON_CMD - shell command to power the board ON (e.g. +# "ssh pi@Pi4 'raspi-gpio set 20 op dh'") +# HW_SERVER - path to the Vitis hw_server binary +# (e.g. /opt/Xilinx/2025.2/Vitis/bin/hw_server) +# UART_LABEL - uart-monitor board label for the ZCU102 USB-UART +# (e.g.
ZYNQMP_ZCU102_UART0) +# +# Optional env: +# OFF_DELAY - seconds to hold OFF before ON (default 4) +# BOOT_DELAY - seconds to wait after ON before JTAG (default 10) +# POST_DELAY - seconds to wait after boot.sh before dumping +# UART (default 5) +# UART_LOG - path to the live log file +# (default /tmp/uart-monitor/latest/$UART_LABEL.log) +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PORT_DIR="$(dirname "${SCRIPT_DIR}")" + +: "${POWER_OFF_CMD:?POWER_OFF_CMD is required (shell cmd that powers the board off)}" +: "${POWER_ON_CMD:?POWER_ON_CMD is required (shell cmd that powers the board on)}" +: "${HW_SERVER:?HW_SERVER is required (path to Vitis hw_server)}" +: "${UART_LABEL:?UART_LABEL is required (uart-monitor board label)}" +OFF_DELAY="${OFF_DELAY:-4}" +BOOT_DELAY="${BOOT_DELAY:-10}" +POST_DELAY="${POST_DELAY:-5}" +UART_LOG="${UART_LOG:-/tmp/uart-monitor/latest/${UART_LABEL}.log}" + +echo "=== Power cycle (POWER_OFF_CMD / POWER_ON_CMD) ===" +eval "${POWER_OFF_CMD}" +sleep "${OFF_DELAY}" +eval "${POWER_ON_CMD}" +echo "Powered on, waiting ${BOOT_DELAY}s for CSU bootROM..." 
+sleep "${BOOT_DELAY}" + +echo +echo "=== Restart hw_server (clears stale JTAG state) ===" +pkill -f hw_server || true +sleep 1 +"${HW_SERVER}" -d >/dev/null 2>&1 & +sleep 3 + +echo +echo "=== Clear UART log (${UART_LABEL}) ===" +uart-monitor clear "${UART_LABEL}" + +echo +echo "=== JTAG boot FSBL + app ===" +"${SCRIPT_DIR}/boot.sh" + +echo +echo "=== Waiting ${POST_DELAY}s for app to produce output ===" +sleep "${POST_DELAY}" + +echo +echo "=== UART output (${UART_LOG}) ===" +cat "${UART_LOG}" diff --git a/src/port/amd/boards/zcu102/target.ld b/src/port/amd/boards/zcu102/target.ld new file mode 100644 index 00000000..5a207c3b --- /dev/null +++ b/src/port/amd/boards/zcu102/target.ld @@ -0,0 +1,130 @@ +/* ZCU102 (Xilinx UltraScale+ MPSoC, Cortex-A53) Linker Script + * + * Memory map (current OCM-only layout): + * OCM : 256 KB @ 0xFFFC0000 (everything lives here) + * DDR low : 2 GB @ 0x00000000 (initialized by FSBL, currently unused + * by this app; reserved for future + * heap or larger ring buffers) + * + * App layout in OCM: + * 0xFFFC0000 .vectors (2 KB-aligned) + * ... .text, .rodata, .data, .bss, .page_tables, + * .dma_buffers (linker packs them in order) + * 0x100000000 _stack_top (top of OCM, stack grows down) + * + * Why OCM-only: + * - JTAG iteration: psu_init alone (no PMU FW) doesn't reliably + * bring up DDR for mwr-force loads. OCM is independent of the + * DDR controller and always works. + * - SD boot: bootgen will emit a warning about OCM overlap with + * FSBL, but FSBL's jump-to-image happens after partition load, + * so the overlay is safe. + * - The 16-KB JTAG DAP alias bug at the low DDR window is avoided + * entirely. + * + * .dma_buffers stays inside OCM (Normal-WB per L2_PERIPH[511]); GEM + * DMA coherency is handled with explicit DC CVAC / IVAC ops in gem.c + * rather than via an MMU attribute carve-out. 
+ */ + +OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") +OUTPUT_ARCH(aarch64) +ENTRY(_start) + +/* Single-region layout: everything in OCM. DDR and DMA region + * definitions are kept as placeholders for a future layout that + * spills .dma_buffers (and possibly .bss) into DDR once the JTAG + * iteration path supports it. They are not referenced by any SECTION + * in the current layout. */ +MEMORY +{ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 + DDR (rw) : ORIGIN = 0x00010000, LENGTH = 0x001F0000 + DMA (rw) : ORIGIN = 0x00200000, LENGTH = 0x00200000 +} + +/* Stack at top of OCM (we keep stack in OCM with the rest since + * DDR-via-JTAG is unreliable without PMU FW). The 24 KB free area + * above DMA buffers gives plenty of stack room for our app. */ +_stack_top = 0x100000000; + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > OCM :text + + .vectors : + { + . = ALIGN(2048); + KEEP(*(.vectors)) + } > OCM :text + + .text : + { + . = ALIGN(8); + *(.text*) + *(.rodata*) + . = ALIGN(8); + } > OCM :text + + .data : + { + . = ALIGN(8); + _sdata = .; + *(.data*) + . = ALIGN(8); + _edata = .; + } > OCM :text + + /* BSS in OCM as well - DDR-via-JTAG isn't reliable without PMU + * FW, so we keep all writeable state in OCM (256 KB total). */ + .bss (NOLOAD) : + { + . = ALIGN(8); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(8); + _ebss = .; + } > OCM :text + + /* Page tables in OCM so MMU walker isn't dependent on DDR being + * fully up (DDR-via-JTAG is unreliable without PMU FW; CPU + * fetch from OCM is bulletproof). 12 KB total (3x4KB tables). */ + .page_tables (NOLOAD) : + { + . = ALIGN(4096); + _page_tables_start = .; + *(.page_tables) + . = ALIGN(4096); + _page_tables_end = .; + } > OCM :text + + /* DMA buffers also in OCM - OCM is accessible to all AXI masters + * including the GEM DMA. 
With everything in OCM there's no DDR + * dependency for the basic bring-up. */ + .dma_buffers (NOLOAD) : + { + . = ALIGN(64); + _dma_buffers_start = .; + *(.dma_buffers) + . = ALIGN(64); + _dma_buffers_end = .; + } > OCM :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/amd/boards/zcu102/target_ddr.ld b/src/port/amd/boards/zcu102/target_ddr.ld new file mode 100644 index 00000000..879b670c --- /dev/null +++ b/src/port/amd/boards/zcu102/target_ddr.ld @@ -0,0 +1,122 @@ +/* ZCU102 (Xilinx UltraScale+ MPSoC, Cortex-A53) Linker Script - DDR layout + * + * Used when the app is loaded by wolfBoot (or any loader that places + * the signed image into DDR at a known LOAD_ADDRESS). FSBL + PMU FW + + * BL31 are all running by the time control reaches us, so the DDR + * controller is fully initialised and the DDR DAP 16-KB alias bug is + * a non-issue (the loader writes via the AXI master path). + * + * Memory map: + * DDR : 0x10000000 .. 0x10FFFFFF (16 MB; matches WOLFBOOT_LOAD_ADDRESS + * in wolfBoot's config/examples/zynqmp.config) + * OCM : 0xFFFC0000 .. 0xFFFFFFFF (256 KB, still mapped Normal-WB + * executable by L2_PERIPH[511]; unused + * for this layout but left in MEMORY + * so MMU page-table addresses inside + * mmu.c remain valid) + * + * App layout in DDR (16 MB region @ 0x10000000): + * .vectors / .text / .rodata / .data / .bss / .page_tables / .dma_buffers + * stack grows down from _stack_top at the top of the region + * + * Stack top is set near the end of the DDR region with plenty of head + * room (4 MB) below for .bss + page tables + DMA buffers. Increase + * the LENGTH below if a larger heap or more DMA buffers are needed. 
+ */ + +OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") +OUTPUT_ARCH(aarch64) +ENTRY(_start) + +MEMORY +{ + DDR (rwx) : ORIGIN = 0x10000000, LENGTH = 0x01000000 /* 16 MB */ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 /* still mapped */ +} + +/* Stack near the top of the DDR region. 16 MB - 4 KB gives the stack + * a safe red zone. */ +_stack_top = 0x10FFF000; + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + /* First 4 bytes of the image must be a `b _start` so wolfBoot's + * do_boot() (which branches to LOAD_ADDRESS, not the ELF entry) + * lands on a valid instruction. */ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > DDR :text + + .vectors : + { + . = ALIGN(2048); + KEEP(*(.vectors)) + } > DDR :text + + .text : + { + . = ALIGN(8); + *(.text*) + *(.rodata*) + . = ALIGN(8); + } > DDR :text + + .data : + { + . = ALIGN(8); + _sdata = .; + *(.data*) + . = ALIGN(8); + _edata = .; + } > DDR :text + + .bss (NOLOAD) : + { + . = ALIGN(8); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(8); + _ebss = .; + } > DDR :text + + .page_tables (NOLOAD) : + { + . = ALIGN(4096); + _page_tables_start = .; + *(.page_tables) + . = ALIGN(4096); + _page_tables_end = .; + } > DDR :text + + /* DMA buffers get their own 2 MB-aligned block so the MMU can map + * just this range Normal-NC without making the .text block (which + * the CPU executes) non-cacheable or non-executable. NC is required + * for the GEM rings: the 8-byte BDs share 64-byte cache lines, so a + * cacheable ring with per-BD cache_clean writes stale neighbour BDs + * back over MAC-set OWN bits and wedges RX under sustained (TCP-rate) + * load. See mmu_aarch64.c (Skoll HIGH-2). */ + .dma_buffers (NOLOAD) : + { + . = ALIGN(0x200000); + _dma_buffers_start = .; + *(.dma_buffers) + . 
= ALIGN(64); + _dma_buffers_end = .; + } > DDR :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/amd/boards/zynq7000/.gitignore b/src/port/amd/boards/zynq7000/.gitignore new file mode 100644 index 00000000..8e5ab963 --- /dev/null +++ b/src/port/amd/boards/zynq7000/.gitignore @@ -0,0 +1,4 @@ +*.o +*.elf +*.bin +BOOT.BIN diff --git a/src/port/amd/boards/zynq7000/Makefile b/src/port/amd/boards/zynq7000/Makefile new file mode 100644 index 00000000..c3f577db --- /dev/null +++ b/src/port/amd/boards/zynq7000/Makefile @@ -0,0 +1,80 @@ +# Xilinx Zynq-7000 (Cortex-A9, ARMv7-A 32-bit) wolfIP bare-metal port +# +# Build: make CROSS_COMPILE=arm-none-eabi- +# +# Toolchain: ARM GNU arm-none-eabi-gcc (tested with 13.2). +# +# Brought up on a ZC702 (Cortex-A9): DHCP, ICMP ping and UDP echo work. + +CROSS_COMPILE ?= arm-none-eabi- +CC := $(CROSS_COMPILE)gcc +OBJCOPY := $(CROSS_COMPILE)objcopy +SIZE := $(CROSS_COMPILE)size + +ROOT := ../../../../.. +AMD := $(ROOT)/src/port/amd +COMMON := $(AMD)/common +IP := $(AMD)/ip +ARCH := $(AMD)/arch/armv7 + +# Cortex-A9, ARMv7-A 32-bit, no NEON in cert paths. +CFLAGS := -mcpu=cortex-a9 -marm +CFLAGS += -Os -ffreestanding -fno-builtin -fno-common +CFLAGS += -fdata-sections -ffunction-sections +CFLAGS += -g -Wall -Wextra -Werror -Wno-unused-parameter +CFLAGS += -std=gnu99 +CFLAGS += -I. -I$(COMMON) -I$(ARCH) -I$(IP) -I$(ROOT) -I$(ROOT)/src -I$(ROOT)/src/port +CFLAGS += -DZYNQ7000 -DXILINX_ARMV7 +CFLAGS += $(CFLAGS_EXTRA) + +ASFLAGS := -mcpu=cortex-a9 -marm + +LDSCRIPT := target.ld +LDFLAGS := -nostdlib -nostartfiles -T $(LDSCRIPT) -Wl,-gc-sections +# Override newlib's memset/memcpy with the __wrap_ versions in app.c +# (the same "fast memset uses an instruction the bare-metal setup +# does not tolerate" pattern we hit on the AArch64 port).
+LDFLAGS += -Wl,--wrap=memset -Wl,--wrap=memcpy + +LOCAL_C := app.c board.c uart_cadence.c uart_util.c mmu_armv7.c gic_gicv2.c \ + gem_core.c board_gem.c gem_rx_poll.c \ + phy_dp83867.c phy_marvell.c phy_dispatch_multi.c entropy.c +LOCAL_S := startup_armv7.S +LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o) + +WOLFIP_OBJ := wolfip.o +OBJS := $(LOCAL_OBJS) $(WOLFIP_OBJ) + +# Shared sources live outside this board dir; find them by vpath so the +# .o files still land here (keeps clean + JTAG app.elf-in-place working). +vpath %.c $(COMMON):$(ARCH):$(IP) +vpath %.S $(ARCH) + +all: app.elf + @echo "Built: app.elf" + @$(SIZE) app.elf + +app.elf: $(OBJS) $(LDSCRIPT) + $(CC) $(CFLAGS) $(OBJS) $(LDFLAGS) \ + -Wl,--start-group -lc -lgcc -Wl,--end-group -o $@ + +$(WOLFIP_OBJ): $(ROOT)/src/wolfip.c + $(CC) $(CFLAGS) -Wno-zero-length-bounds -Wno-type-limits -c $< -o $@ + +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +%.o: %.S + $(CC) $(ASFLAGS) -c $< -o $@ + +clean: + rm -f $(OBJS) app.elf BOOT.BIN + +.PHONY: all clean help + +help: + @echo "Zynq-7000 wolfIP build (Cortex-A9):" + @echo " make - build app.elf" + @echo " make clean - remove artifacts" + @echo "" + @echo "Override CROSS_COMPILE if your toolchain prefix differs." diff --git a/src/port/amd/boards/zynq7000/README.md b/src/port/amd/boards/zynq7000/README.md new file mode 100644 index 00000000..5c5eaaf0 --- /dev/null +++ b/src/port/amd/boards/zynq7000/README.md @@ -0,0 +1,94 @@ +# wolfIP port: Xilinx Zynq-7000 (Cortex-A9, ARMv7-A 32-bit) + +**STATUS: brought up on a ZC702.** DHCP, ICMP ping and the UDP echo demo all work on real hardware (Cortex-A9, Marvell 88E1518 PHY). See "Hardware bring-up notes" below for the Zynq-7000-specific differences that mattered. + +## What this port is + +Bare-metal wolfIP port for the Xilinx Zynq-7000 family (Z-7020 etc., e.g. ZC702 / ZedBoard / MicroZed dev boards). Cortex-A9 in SVC mode, GCC bare-metal, no Xilinx Standalone BSP, no FreeRTOS. 
Targets the same deterministic UDP/IPv4 profile as the ZCU102 port. + +## What differs from ZCU102 + +| Subsystem | ZCU102 (ZynqMP) | Zynq-7000 | Where it lives | +|-----------|-----------------|-----------|----------------| +| Architecture | ARMv8-A AArch64 | ARMv7-A 32-bit | toolchain prefix | +| CPU core | Cortex-A53 | Cortex-A9 | `Makefile` (-mcpu) | +| Bootloader handoff | FSBL -> EL3 | FSBL -> SVC | `startup_armv7.S` | +| Toolchain | `aarch64-none-elf-gcc` | `arm-none-eabi-gcc` | `Makefile` | +| Exception model | EL3 vectors | ARMv7 exception modes | `startup_armv7.S` rewritten | +| MMU | 4-level long descriptor | 1-level short descriptor | `mmu_armv7.c` rewritten | +| Cache ops | DC CVAC / DC IVAC | MCR p15 c7 (DCCMVAC/DCIMVAC) | `gem_core.c` | +| Timer | Generic timer (`mrs cntpct_el0`) | MPCore Global Timer at `0xF8F00200` (the A9 has no generic timer) | `timer.h`, `entropy.c` | +| GIC | GIC-400 (GICv2) | GIC-390 (GICv2) | `gic_gicv2.c` (same driver, different base) | +| GIC base addrs | `0xF901xxxx` | `0xF8F0xxxx` | `board.h` | +| UART | Cadence at 0xFF000000 | Cadence UART1 at 0xE0001000 | `board.h` (same driver) | +| Clock + reset | CRL_APB at 0xFF5E0000 | SLCR at 0xF8000000 | `board.h`, `board_gem.c` (SLCR clock/reset hooks) | +| GEM count | 4 (GEM0-3) | 2 (GEM0-1) | `board.h` | +| On-board RJ45 | GEM3 (INTID 95) | GEM0 (INTID 54) | `board.h` | +| BD format | 8-byte (DMACR[30]=0) | 8-byte (no 64-bit option) | `gem_core.c` (unchanged) | + +## Build + +``` +cd src/port/amd/boards/zynq7000 +make CROSS_COMPILE=arm-none-eabi- +``` + +Output: `app.elf`.
+ +## JTAG boot (ZC702) + +The ZC702 is loaded through its on-board Digilent USB-JTAG module; set SW10 +to the on-board (USB) JTAG position and SW16 to JTAG boot mode (all OFF), then: + +``` +XSDB=/opt/Xilinx/2025.2/Vitis/bin/xsdb \ +FSBL_ELF=/path/to/zynq_fsbl.elf \ +./jtag/boot.sh +``` + +`jtag/boot.tcl` runs the prebuilt FSBL (ps7_init brings up DDR/MIO/clocks/ +UART), remaps all four OCM banks high (`SLCR.OCM_CFG`) so the app can load +at `0xFFFC0000`, then loads `app.elf` and starts it in SVC mode. The +console is on **UART1** (the ZC702 USB-UART), not UART0. After a run the +A9 must be power-cycled to be JTAG-loadable again. + +## Hardware bring-up notes (what was Zynq-7000-specific) + +These are the things that differed from the AArch64 ports and had to be +fixed for the ZC702 to reach DHCP/ping/echo: + +- **No ARM generic timer.** The Cortex-A9 does not implement CNTPCT/CNTFRQ + (CP15 c14); those encodings are UNDEFINED and trap. `timer.h` and + `entropy.c` use the MPCore **Global Timer** at `0xF8F00200` (333 MHz) + instead. +- **Console is UART1.** The ZC702 routes the USB console to Cadence UART1 + (`0xE0001000`); `uart_cadence.c` trusts the FSBL's baud config rather than + reprogramming the divisor (the UART_REF_CLK is not the ZynqMP value). +- **Marvell 88E1518 PHY, not DP83867.** The ZC702 fits a Marvell PHY + (OUI `0x0141`) at MDIO addr 7. `phy_marvell.c` handles its paged RGMII + delay registers + autoneg; `gem_core.c` dispatches on the PHY ID. +- **GEM clock via SLCR, write-protected.** `SLCR.GEM0_CLK_CTRL` + (`0xF8000140`) has a different layout than ZynqMP's CRL_APB and is + write-locked. `gem_set_ref_clk` unlocks the SLCR (`0xDF0D`) and writes + `0x00100801` for 125 MHz (1 Gbps). `SLCR.GEM0_RCLK_CTRL` (`0xF8000138`) + must also be set to source the RGMII RX clock from the PHY, or the MAC + receives nothing (matches Xilinx ps7_init).
+- **Poll-driven RX, GEM IRQ masked.** Unlike the Versal GICv3, the A9 GIC + delivers the GEM SPI, and an enabled RX-complete interrupt storms the + CPU. RX is polled from `eth_poll` and the GEM interrupt is left masked. +- **Non-cacheable OCM for DMA.** The 8-byte GEM descriptors share 32-byte + cache lines, so per-descriptor cache maintenance corrupts neighbours' + OWN bits and stalls RX. The OCM section is mapped Normal non-cacheable + (`mmu_armv7.c`) so the descriptor rings and buffers are DMA-coherent. (The + PL310 L2 is also disabled as a belt-and-braces measure.) Note the A9 + L1 cache line is 32 bytes, not the 64 of the AArch64 cores. +- `NWCFG_DWIDTH_64` (NWCFG bit 21) is set to mirror the validated + U-Boot/Linux register state, but is a no-op here: the A9 GEM AXI + master path is 32-bit and the BDs stay 8 bytes (DMACR[30]=0). + +## Files + +See `src/port/amd/README.md` for the shared-tree layout. The Zynq-7000 +extras are `ip/phy_marvell.c` / `phy_marvell.h` (the ZC702 PHY) selected +via `ip/phy_dispatch_multi.c`, and this board dir's `jtag/` (FSBL-based +JTAG loader). diff --git a/src/port/amd/boards/zynq7000/board.c b/src/port/amd/boards/zynq7000/board.c new file mode 100644 index 00000000..72f6bbcc --- /dev/null +++ b/src/port/amd/boards/zynq7000/board.c @@ -0,0 +1,21 @@ +/* board.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Zynq-7000 (ZC702) board hooks for the shared demo (app.c). + */ +#include "app.h" + +const char *board_banner(void) +{ + return "\n\n=== wolfIP Zynq-7000 (Cortex-A9 SVC) ===\n" + "MMU on, caches on. Bringing up GIC-390...\n"; +} + +void board_irq_setup(void) +{ + /* RX is poll-driven and the GEM interrupt is left masked, so there is + * nothing to unmask at the CPU here. 
*/ +} diff --git a/src/port/amd/boards/zynq7000/board.h b/src/port/amd/boards/zynq7000/board.h new file mode 100644 index 00000000..de1771b1 --- /dev/null +++ b/src/port/amd/boards/zynq7000/board.h @@ -0,0 +1,121 @@ +/* board.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Xilinx Zynq-7000 (Cortex-A9, ARMv7-A 32-bit) PS register base + * addresses and GIC interrupt IDs. Derived from the Zynq-7000 TRM + * (UG585). No Xilinx Standalone BSP header is required. + * + * Brought up on a ZC702 (Cortex-A9). Mirrors src/port/zcu102/ + * structurally. 
Key differences: + * - Cortex-A9 (not A53), ARMv7-A 32-bit (not AArch64) + * - SLCR replaces ZynqMP's CRL_APB + * - GIC-390 (GICv2) inside the SCU at different base addresses + * - Cadence UART (same IP as ZynqMP; different base address) + * - Cadence GEM (older revision; 32-bit BD format default) + * - 2 GEMs (GEM0 / GEM1); on-board RJ45 is typically GEM0 + */ +#ifndef ZYNQ7000_BOARD_H +#define ZYNQ7000_BOARD_H + +#include + +/* --------------------------------------------------------------------- + * Memory map (Zynq-7000 PS) + * ------------------------------------------------------------------- */ +#define DDR_BASE 0x00000000UL +#define DDR_SIZE 0x40000000UL /* 1 GB typical, e.g. ZC702 */ + +/* OCM is mappable to 0x00000000 (low) or 0xFFFC0000 (high). Most + * bare-metal apps use the high mapping; FSBL configures the OCM + * address filter via SLCR.OCM_CFG. We assume the high mapping. */ +#define OCM_BASE 0xFFFC0000UL +#define OCM_SIZE 0x00040000UL /* 256 KB */ + +/* --------------------------------------------------------------------- + * PS peripherals + * ------------------------------------------------------------------- */ +#define UART0_BASE 0xE0000000UL /* Cadence */ +#define UART1_BASE 0xE0001000UL + +/* Console UART: ZC702 routes UART1 to the on-board USB-UART. The FSBL + * already set the baud for the board ref clock; do not reprogram. */ +#define UART_BASE UART1_BASE + +#define GEM0_BASE 0xE000B000UL /* on-board RJ45 typical */ + +/* On-board RJ45 is GEM0 on the ZC702. */ +#define GEM_BASE GEM0_BASE +#define IRQ_GEM IRQ_GEM0 +#define GEM1_BASE 0xE000C000UL + +#define SLCR_BASE 0xF8000000UL /* clock + reset */ + +/* GIC-390 (ARMv7 GICv2 compatible). Distributor + CPU IF are in the + * SCU (Snoop Control Unit) memory region on Zynq-7000. */ +#define GICD_BASE 0xF8F01000UL +#define GICC_BASE 0xF8F00100UL + +/* --------------------------------------------------------------------- + * GIC interrupt IDs (raw GIC INTIDs, not GIC_SPI offsets). 
+ * Per Zynq-7000 TRM Table 7-3: + * GEM0: INTID 54 + * GEM1: INTID 77 + * ------------------------------------------------------------------- */ +#define IRQ_GEM0 54 +#define IRQ_GEM1 77 + +/* --------------------------------------------------------------------- + * SLCR clock and reset registers + * ------------------------------------------------------------------- */ +#define SLCR_LOCK (SLCR_BASE + 0x004) +#define SLCR_UNLOCK (SLCR_BASE + 0x008) +#define SLCR_GEM0_CLK_CTRL (SLCR_BASE + 0x140) +#define SLCR_GEM0_RCLK_CTRL (SLCR_BASE + 0x138) /* RGMII RX clock src */ +#define SLCR_GEM1_CLK_CTRL (SLCR_BASE + 0x144) +#define SLCR_GEM_RST_CTRL (SLCR_BASE + 0x214) + +#define SLCR_UNLOCK_KEY 0xDF0D /* per TRM */ + +/* --------------------------------------------------------------------- + * Cadence UART0 baud + * ------------------------------------------------------------------- */ +#define UART_BAUD 115200 + +/* MAC address for eth0. Locally-administered, even first octet. */ +#ifndef WOLFIP_MAC_0 +#define WOLFIP_MAC_0 0x02 +#endif +#ifndef WOLFIP_MAC_1 +#define WOLFIP_MAC_1 0x00 +#endif +#ifndef WOLFIP_MAC_2 +#define WOLFIP_MAC_2 0x5A +#endif +#ifndef WOLFIP_MAC_3 +#define WOLFIP_MAC_3 0x11 +#endif +#ifndef WOLFIP_MAC_4 +#define WOLFIP_MAC_4 0x22 +#endif +#ifndef WOLFIP_MAC_5 +#define WOLFIP_MAC_5 0x33 +#endif + +#endif /* ZYNQ7000_BOARD_H */ diff --git a/src/port/amd/boards/zynq7000/board_gem.c b/src/port/amd/boards/zynq7000/board_gem.c new file mode 100644 index 00000000..c331469b --- /dev/null +++ b/src/port/amd/boards/zynq7000/board_gem.c @@ -0,0 +1,96 @@ +/* board_gem.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Zynq-7000 GEM clock/reset hooks for the shared GEM core. The GEM0 + * reference clock and reset live in the SLCR (write-locked, different + * layout from ZynqMP's CRL_APB). Also disables the PL310 L2 before MAC + * config so the bare-metal L1-only cache ops give correct DMA coherency. 
+ */ +#include +#include "board.h" +#include "gem_port.h" +#include "timer.h" /* delay_us / delay_ms - deterministic, counter-backed */ + +#define SLCR_GEM0_RST_BIT (1u << 3) /* GEM0 reset bit in GEM_RST_CTRL */ + +/* The FSBL enables the PL310 L2 cache, but this port's L1-only cache ops + * only maintain the A9 L1 to the PoC. Once a GEM descriptor lands in L2, + * the MAC's DMA writes to it are invisible to the CPU (stale L2 copy), + * stalling RX after the first burst. Disable L2 so the PoC is main + * memory. PL310 controller is at 0xF8F02000 on Zynq-7000. */ +#define PL310_BASE 0xF8F02000u +#define PL310_CONTROL (*(volatile uint32_t *)(PL310_BASE + 0x100)) +#define PL310_CLEAN_INV_WAY (*(volatile uint32_t *)(PL310_BASE + 0x7FC)) +#define PL310_CACHE_SYNC (*(volatile uint32_t *)(PL310_BASE + 0x730)) +#define PL310_ALL_WAYS 0x000000FFu /* 8-way associative on Z-7000 */ + +static void pl310_l2_disable(void) +{ + if ((PL310_CONTROL & 1u) == 0u) + return; /* already disabled */ + PL310_CLEAN_INV_WAY = PL310_ALL_WAYS; + while (PL310_CLEAN_INV_WAY & PL310_ALL_WAYS) + ; /* wait for clean+invalidate */ + PL310_CACHE_SYNC = 0u; + PL310_CONTROL = 0u; /* disable L2 */ + __asm__ volatile ("dsb" ::: "memory"); +} + +void gem_soc_pre_init(void) +{ + pl310_l2_disable(); +} + +/* Configure SLCR.GEM0_CLK_CTRL for the negotiated link speed. With + * SRCSEL=IO_PLL (~1000 MHz) and DIVISOR0=8 the base is 125 MHz, so + * DIVISOR1 selects the line rate: 1 -> 125 MHz (1G), 5 -> 25 MHz (100M), + * 50 -> 2.5 MHz (10M). The SLCR is write-protected; unlock it first. 
*/ +void gem_set_ref_clk(int speed_mbps) +{ + volatile uint32_t *unlock = (volatile uint32_t *)SLCR_UNLOCK; + volatile uint32_t *gem0_clk = (volatile uint32_t *)SLCR_GEM0_CLK_CTRL; + volatile uint32_t *gem0_rclk = (volatile uint32_t *)SLCR_GEM0_RCLK_CTRL; + uint32_t div1; + uint32_t val; + + switch (speed_mbps) { + case 1000: div1 = 1; break; + case 100: div1 = 5; break; + case 10: div1 = 50; break; + default: div1 = 1; break; /* unrecognised speed: use the 1G divisor */ + } + val = ((div1 & 0x3Fu) << 20) /* DIVISOR1 */ + | ((8u & 0x3Fu) << 8) /* DIVISOR0 = 8 (IO_PLL/8 = 125 MHz) */ + | (0u << 4) /* SRCSEL = IO_PLL */ + | (1u << 0); /* CLKACT */ + *unlock = SLCR_UNLOCK_KEY; + /* GEM0_RCLK_CTRL: source the RGMII RX clock from the PHY's RXC pin via + * MIO (SRCSEL=0) and enable it (CLKACT=1), or the MAC receives + * nothing. Matches the Xilinx ps7_init write. */ + *gem0_rclk = (*gem0_rclk & ~0x11u) | 0x01u; + *gem0_clk = val; + /* Re-lock the SLCR so stray writes can't scribble the clock/reset/ + * pinmux block for the rest of runtime. */ + *(volatile uint32_t *)SLCR_LOCK = 0x767Bu; /* SLCR lock key */ +} + +/* Pulse the GEM0 reset bit so the MAC starts from a known state, then + * force the 125 MHz reference (amd_eth_init downshifts later if needed). + * + * NOTE(review): UG585 appears to list GEM_RST_CTRL bits 3:2 as reserved, + * with the GEM0 resets at bits 0 (CPU1X), 4 (RX) and 6 (REF). Confirm that + * SLCR_GEM0_RST_BIT (bit 3) really resets GEM0; if it does not, the pulse + * below is a no-op and the MAC keeps whatever state the FSBL left. */ +void gem_clk_reset(void) +{ + volatile uint32_t *rst = (volatile uint32_t *)SLCR_GEM_RST_CTRL; + volatile uint32_t *unlock = (volatile uint32_t *)SLCR_UNLOCK; + + /* The SLCR is write-protected; the reset writes below are silently + * dropped unless we unlock first. gem_set_ref_clk() re-locks it. */ + *unlock = SLCR_UNLOCK_KEY; + *rst |= SLCR_GEM0_RST_BIT; + delay_us(10); /* hold the reset asserted */ + *rst &= ~SLCR_GEM0_RST_BIT; + delay_ms(10); /* settle after deassert (counter-backed) */ + + gem_set_ref_clk(1000); +} diff --git a/src/port/amd/boards/zynq7000/config.h b/src/port/amd/boards/zynq7000/config.h new file mode 100644 index 00000000..d699fbd5 --- /dev/null +++ b/src/port/amd/boards/zynq7000/config.h @@ -0,0 +1,31 @@ +/* config.h + * + * Copyright (C) 2026 wolfSSL Inc.
+ * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * wolfIP configuration for Xilinx Zynq-7000 (Cortex-A9, SVC mode + * bare-metal). Shared AMD/Xilinx profile lives in common/wolfip_config.h. + */ +#ifndef WOLF_CONFIG_H +#define WOLF_CONFIG_H + +/* Per-board overrides (if any) go here, before the shared profile. */ + +#include "wolfip_config.h" + +#endif /* WOLF_CONFIG_H */ diff --git a/src/port/amd/boards/zynq7000/jtag/boot.sh b/src/port/amd/boards/zynq7000/jtag/boot.sh new file mode 100755 index 00000000..3ed494fc --- /dev/null +++ b/src/port/amd/boards/zynq7000/jtag/boot.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# +# Boot the wolfIP Zynq-7000 (ZC702) app via JTAG. Assumes a hw_server +# reachable on localhost (the default when Vitis is local). +# +# The prebuilt FSBL brings the PS up (ps7_init: DDR/MIO/clocks/UART) and +# parks; jtag/boot.tcl then loads the app ELF over the top and starts it +# in SVC mode. No psu_init.tcl and no PDI -- Zynq-7000 has neither. +# +# Set the ZC702 boot-mode straps to JTAG (SW16 = all OFF) and power-cycle +# before use. +# +# Required env (no built-in defaults; set per-developer): +# XSDB - path to Vitis xsdb binary +# (e.g. /opt/Xilinx/2025.2/Vitis/bin/xsdb) +# FSBL_ELF - path to a prebuilt Zynq-7000 FSBL ELF (e.g. 
from +# wolfSSL/soc-prebuilt-firmware zc702-zynq/zynq_fsbl.elf, +# or a Vitis/PetaLinux build) +# +# Optional env (sensible defaults): +# APP_ELF - default: ${PORT_DIR}/app.elf +# READELF - default: arm-none-eabi-readelf (reads the app entry) +# +# Usage (from the port directory): +# XSDB=/opt/Xilinx/2025.2/Vitis/bin/xsdb \ +# FSBL_ELF=/path/to/zynq_fsbl.elf \ +# ./jtag/boot.sh +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PORT_DIR="$(dirname "${SCRIPT_DIR}")" + +: "${XSDB:?XSDB is required (path to Vitis xsdb binary)}" +: "${FSBL_ELF:?FSBL_ELF is required (path to a prebuilt Zynq-7000 FSBL ELF)}" +APP_ELF="${APP_ELF:-${PORT_DIR}/app.elf}" +export READELF="${READELF:-arm-none-eabi-readelf}" + +if ! command -v "${XSDB}" >/dev/null 2>&1 && [[ ! -x "${XSDB}" ]]; then + echo "ERROR: xsdb not found / not executable: ${XSDB}" >&2 + exit 1 +fi +if [[ ! -f "${FSBL_ELF}" ]]; then + echo "ERROR: FSBL_ELF not found at ${FSBL_ELF}" >&2 + exit 1 +fi +if [[ ! -f "${APP_ELF}" ]]; then + echo "ERROR: app.elf not found at ${APP_ELF}. Run 'make' first." >&2 + exit 1 +fi + +echo "JTAG boot Zynq-7000 (ZC702) wolfIP app" +echo " xsdb : ${XSDB}" +echo " fsbl : ${FSBL_ELF}" +echo " app.elf : ${APP_ELF}" +echo + +export APP_ELF FSBL_ELF + +"${XSDB}" "${SCRIPT_DIR}/boot.tcl" + +echo +echo "App is running. Watch UART:" +echo " uart-monitor tail " diff --git a/src/port/amd/boards/zynq7000/jtag/boot.tcl b/src/port/amd/boards/zynq7000/jtag/boot.tcl new file mode 100644 index 00000000..bcbd1d35 --- /dev/null +++ b/src/port/amd/boards/zynq7000/jtag/boot.tcl @@ -0,0 +1,86 @@ +# JTAG load of the wolfIP Cortex-A9 bare-metal app on Zynq-7000 (ZC702). +# +# Zynq-7000 has no PMU/PLM. The prebuilt FSBL brings the PS up (ps7_init: +# DDR, MIO pinmux, clocks, UART) and then parks itself (no bundled +# second stage). We run the FSBL, stop where it parks, then load our app +# over the top and start it in SVC mode. 
Pattern adapted from the +# wolfBoot Zynq-7000 jtag_load.tcl. +# +# Set the ZC702 boot-mode straps to JTAG (SW16 = all OFF) and power-cycle +# before use. After a run the board may need a power-cycle to recover the +# CPU into a JTAG-loadable state. +# +# Env vars (set by jtag/boot.sh): +# APP_ELF path to app.elf +# FSBL_ELF path to a prebuilt Zynq-7000 FSBL ELF +# READELF arm-none-eabi-readelf (to read the app entry point) + +set APP_ELF $env(APP_ELF) +set FSBL_ELF $env(FSBL_ELF) +set readelf [expr {[info exists env(READELF)] ? $env(READELF) : "arm-none-eabi-readelf"}] + +puts "Connecting..." +connect + +# The chain sometimes comes up empty if a previous run left the CPU +# off-chain (WFI + clock gated). Retry the A9 target selection. +set selected 0 +for {set i 0} {$i < 5} {incr i} { + set rc [catch {targets -set -filter {name =~ "ARM Cortex-A9 MPCore #0"}} err] + if {$rc == 0} { set selected 1; break } + puts "Cortex-A9 select failed (try $i): $err" + after 500 +} +if {!$selected} { + puts "ERROR: could not select Cortex-A9 target after 5 retries." + puts "Power-cycle the ZC702 and try again." + exit 1 +} + +# Full PS reset, then wait for the BootROM to enter its JTAG-mode poll +# loop before loading the FSBL. +puts "System reset..." +rst -system +after 1500 +targets -set -filter {name =~ "ARM Cortex-A9 MPCore #0"} + +# Run the FSBL to completion: ps7_init (DDR/MIO/clocks/UART), then it +# parks. 3 s is plenty. +puts "Loading FSBL: $FSBL_ELF" +dow $FSBL_ELF +con +after 3000 + +# Stop where the FSBL parked. Do NOT rst -processor here -- that drops +# back into the BootROM and loses the FSBL's PS configuration. +stop + +# The OCM-layout app links at 0xFFFC0000, but the FSBL leaves the OCM +# banks at the reset mapping (banks 0-2 at 0x00000000, bank 3 at +# 0xFFFF0000), so 0xFFFC0000-0xFFFEFFFF is unmapped and `dow` fails with +# "OCM is not enabled at 0xFFFC0000". 
Map all four 64 KB OCM banks to the +# high address so the whole 256 KB sits contiguously at 0xFFFC0000. +# SLCR must be unlocked first (0xDF0D), then re-locked (0x767B). +puts "Mapping OCM high (SLCR.OCM_CFG = 0x0F)..." +mwr -force 0xF8000008 0x0000DF0D ;# SLCR_UNLOCK +mwr -force 0xF8000910 0x0000000F ;# OCM_CFG: RAM_HI for all 4 banks +mwr -force 0xF8000004 0x0000767B ;# SLCR_LOCK + +# Load the app. xsdb's `dow` does not reliably set the PC on a second +# target dow, so set PC (app entry) and CPSR (SVC, IRQ/FIQ masked) +# explicitly. ARMv7-A reset/SVC convention, unlike the AArch64 ports. +puts "Loading app: $APP_ELF" +dow $APP_ELF +set entry [exec $readelf -h $APP_ELF | grep "Entry point" | awk "{print \$NF}"] +puts "App ELF entry: $entry" +rwr pc $entry +rwr cpsr 0xD3 ;# SVC mode, IRQ+FIQ masked +puts "PC=[rrd pc] CPSR=[rrd cpsr]" + +puts "" +puts "con... watch UART1 (115200 8N1) for the wolfIP banner." +con + +puts "Detaching, leaving app running." +disconnect +exit diff --git a/src/port/amd/boards/zynq7000/target.ld b/src/port/amd/boards/zynq7000/target.ld new file mode 100644 index 00000000..d2c08e34 --- /dev/null +++ b/src/port/amd/boards/zynq7000/target.ld @@ -0,0 +1,98 @@ +/* Zynq-7000 (Cortex-A9) Linker Script - OCM-only layout + * + * Memory map (Zynq-7000 PS): + * DDR : 0x00000000 .. 0x3FFFFFFF (1 GB on ZC702 / Zynq-7020 typical) + * OCM : 0xFFFC0000 .. 0xFFFFFFFF (256 KB, high-mapped by SLCR.OCM_CFG) + * + * We keep everything in OCM by default for JTAG-iteration symmetry + * with the ZCU102 port. The page tables alone take 16 KB (4096 + * section descriptors x 4 bytes) so we have less spare OCM than the + * AArch64 builds; track sizes and spill to DDR when needed. + * + * Brought up on a ZC702 (Cortex-A9). 
+ */ + +OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") +OUTPUT_ARCH(arm) +ENTRY(_start) + +MEMORY +{ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 + DDR (rw) : ORIGIN = 0x00010000, LENGTH = 0x10000000 +} + +_stack_top = 0xFFFFF000; + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > OCM :text + + .vectors : + { + . = ALIGN(32); /* VBAR alignment requirement */ + KEEP(*(.vectors)) + } > OCM :text + + .text : + { + . = ALIGN(4); + *(.text*) + *(.rodata*) + . = ALIGN(4); + } > OCM :text + + .data : + { + . = ALIGN(4); + _sdata = .; + *(.data*) + . = ALIGN(4); + _edata = .; + } > OCM :text + + .bss (NOLOAD) : + { + . = ALIGN(4); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(4); + _ebss = .; + } > OCM :text + + .page_tables (NOLOAD) : + { + . = ALIGN(16384); /* TTBR0 wants 16 KB alignment */ + _page_tables_start = .; + *(.page_tables) + . = ALIGN(16384); + _page_tables_end = .; + } > OCM :text + + .dma_buffers (NOLOAD) : + { + . = ALIGN(64); + _dma_buffers_start = .; + *(.dma_buffers) + . = ALIGN(64); + _dma_buffers_end = .; + } > OCM :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/amd/common/app.c b/src/port/amd/common/app.c new file mode 100644 index 00000000..cb8c0a97 --- /dev/null +++ b/src/port/amd/common/app.c @@ -0,0 +1,433 @@ +/* app.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Shared wolfIP UDP-echo + DHCP-client demo for the AMD/Xilinx ports + * (ZynqMP / Versal / Zynq-7000). Board-specific bits (startup banner, + * whether to unmask CPU IRQs) come from boards//board.c via app.h. + */ +#include +#include +#include "config.h" +#include "wolfip.h" +#include "board.h" +#include "uart.h" +#include "gic.h" +#include "gem.h" +#include "timer.h" +#include "app.h" + +#define ECHO_PORT 7 +#define RX_BUF_SIZE 1500 + +#ifdef SPEED_TEST +#define SPEED_PORT 9 /* discard/chargen-style TCP throughput port */ +#endif + +static struct wolfIP *IPStack; +#ifndef SPEED_TEST +static int udp_fd = -1; +static uint8_t udp_rx_buf[RX_BUF_SIZE]; +#endif + +/* Monotonic wall-clock milliseconds from the hardware timer. wolfIP needs a + * real-millisecond tick for its DHCP/TCP/ARP timers; a free-running counter + * (tick++) only approximates one and skews every timeout. timer_now() is a + * raw up-counter at timer_freq() Hz. The seconds/remainder split avoids the + * 64-bit overflow that a plain (ticks * 1000) would hit at long uptimes + * (the remainder term stays below freq * 1000); it is exactly equal to + * (ticks * 1000) / freq for all inputs. */ +static uint64_t app_now_ms(void) +{ + uint64_t ticks = timer_now(); + uint64_t freq = timer_freq(); + + return (ticks / freq) * 1000ULL + ((ticks % freq) * 1000ULL) / freq; +} + +/* Override newlib memset/memcpy with our own versions via linker --wrap. + * The AArch64 newlib memset uses 'dc zva' which hangs on these bare-metal + * setups; the ARMv7 override is kept defensively. The Makefile passes + * -Wl,--wrap=memset -Wl,--wrap=memcpy. + * + * These do an 8-byte-at-a-time bulk loop with a bytewise tail (and never + * use 'dc zva'). 
/* Replacements for newlib memset/memcpy, bound through the linker: the
 * Makefile passes -Wl,--wrap=memset -Wl,--wrap=memcpy. The AArch64 newlib
 * memset uses 'dc zva', which hangs on these bare-metal setups; the ARMv7
 * override is kept defensively.
 *
 * Both routines copy 8 bytes at a time with a bytewise head/tail and never
 * use 'dc zva'. The OCM is mapped Normal-NC (SCTLR.A=0), so the only
 * requirement for the 64-bit access is natural alignment, which the code
 * establishes at run time before entering the word loop. */

/* GCC's loop-idiom pass (-ftree-loop-distribute-patterns) may rewrite the
 * byte loops below into calls to memset/memcpy. With --wrap those calls
 * bind to the wrappers themselves and recurse forever, so the pass is
 * switched off for these two functions. */
#if defined(__GNUC__) && !defined(__clang__)
#define APP_NO_LOOP_LIBCALL \
    __attribute__((__optimize__("-fno-tree-loop-distribute-patterns")))
#else
#define APP_NO_LOOP_LIBCALL
#endif

/* 64-bit lane that is allowed to alias the caller's bytes (a plain
 * uint64_t access would violate the strict-aliasing rule). */
typedef uint64_t __attribute__((__may_alias__)) app_word64;

/* Fill n bytes at s with (unsigned char)c. Returns s. */
APP_NO_LOOP_LIBCALL
void *__wrap_memset(void *s, int c, size_t n)
{
    unsigned char *p = (unsigned char *)s;
    const unsigned char cb = (unsigned char)c;
    const uint64_t w = (uint64_t)cb * 0x0101010101010101ULL;

    /* Head: advance bytewise to the next 8-byte boundary. */
    while (n != 0u && (((uintptr_t)p) & 7u) != 0u) {
        *p++ = cb;
        n--;
    }
    /* Bulk: p is now 8-byte aligned (or n is already 0). */
    while (n >= 8u) {
        *(app_word64 *)p = w;
        p += 8;
        n -= 8u;
    }
    /* Tail: fewer than 8 bytes left. */
    while (n != 0u) {
        *p++ = cb;
        n--;
    }
    return s;
}

/* Copy n bytes from src to dest (regions must not overlap). Returns dest. */
APP_NO_LOOP_LIBCALL
void *__wrap_memcpy(void *dest, const void *src, size_t n)
{
    unsigned char *d = (unsigned char *)dest;
    const unsigned char *s = (const unsigned char *)src;

    /* The word loop is only usable when both pointers can be brought to an
     * 8-byte boundary together, i.e. they share the same offset mod 8. */
    if (((((uintptr_t)d) ^ ((uintptr_t)s)) & 7u) == 0u) {
        while (n != 0u && (((uintptr_t)d) & 7u) != 0u) {
            *d++ = *s++;
            n--;
        }
        while (n >= 8u) {
            *(app_word64 *)d = *(const app_word64 *)s;
            d += 8;
            s += 8;
            n -= 8u;
        }
    }
    /* Tail, or the whole copy when the offsets differ. */
    while (n != 0u) {
        *d++ = *s++;
        n--;
    }
    return dest;
}
*/ +extern uint32_t amd_get_random32(void); + +uint32_t wolfIP_getrandom(void) +{ + return amd_get_random32(); +} + +#ifndef SPEED_TEST +static void udp_echo_cb(int fd, uint16_t event, void *arg) +{ + struct wolfIP *s = (struct wolfIP *)arg; + struct wolfIP_sockaddr_in peer; + uint32_t peer_len = sizeof(peer); + int n; + + if (!(event & CB_EVENT_READABLE)) + return; + + n = wolfIP_sock_recvfrom(s, fd, udp_rx_buf, sizeof(udp_rx_buf), 0, + (struct wolfIP_sockaddr *)&peer, &peer_len); + if (n > 0) { + (void)wolfIP_sock_sendto(s, fd, udp_rx_buf, (uint32_t)n, 0, + (struct wolfIP_sockaddr *)&peer, peer_len); + uart_puts("UDP echo: "); uart_putdec((uint32_t)n); + /* sin_addr.s_addr is network byte order (BSD convention); the + * uart_putip4 helper, like atoip4/iptoa, expects host byte order + * (first octet in the high byte), so swap before printing. */ + uart_puts(" bytes from "); uart_putip4(ee32(peer.sin_addr.s_addr)); + uart_puts("\n"); + } +} +#else /* SPEED_TEST */ + +/* TCP throughput server on SPEED_PORT (mirrors the va416xx SPEED_TEST harness). + * One connection at a time: every byte the host sends is sunk (RX test) and, + * whenever the socket is writable, a chargen-style buffer is pushed (TX test). + * On close the totals and an average rate are printed. 
#ifdef SPEED_TEST

/* TCP throughput server on SPEED_PORT (mirrors the va416xx SPEED_TEST
 * harness). One connection at a time: every byte the host sends is sunk
 * (RX test) and, whenever the socket is writable, a chargen-style buffer
 * is pushed (TX test). On close the totals and an average rate are
 * printed. Measure from a host on the same subnet with dd/nc against
 * SPEED_PORT. */
#define SPEED_BURST_MAX 32 /* recv/send iterations allowed per callback */

static int speed_listen_fd = -1;
static int speed_client_fd = -1;
static uint64_t speed_rx_bytes;
static uint64_t speed_tx_bytes;
static uint64_t speed_start_ms;
static uint8_t speed_buf[RX_BUF_SIZE];

/* Print a full 64-bit value in decimal. uart_putdec() is only ever handed
 * 32-bit values in this file, and the byte counters pass 4 GiB well within
 * a minute at gigabit rates, so the totals are printed through this helper
 * rather than truncated. */
static void speed_put_u64(uint64_t v)
{
    char digits[20]; /* UINT64_MAX has 20 decimal digits */
    int n = 0;

    do {
        digits[n++] = (char)('0' + (int)(v % 10u));
        v /= 10u;
    } while (v != 0u);
    while (n > 0)
        uart_putc(digits[--n]);
}

/* Report elapsed time, byte totals and average byte rates for the
 * connection that just closed. */
static void speed_print_result(void)
{
    uint64_t elapsed = app_now_ms() - speed_start_ms;
    uint64_t rx_bps;
    uint64_t tx_bps;

    if (elapsed == 0)
        elapsed = 1; /* avoid dividing by zero on a sub-millisecond run */
    rx_bps = (speed_rx_bytes * 1000ULL) / elapsed;
    tx_bps = (speed_tx_bytes * 1000ULL) / elapsed;

    uart_puts("SPEED done after "); speed_put_u64(elapsed);
    uart_puts(" ms\n RX "); speed_put_u64(speed_rx_bytes);
    uart_puts(" bytes (~"); speed_put_u64(rx_bps);
    uart_puts(" B/s)\n TX "); speed_put_u64(speed_tx_bytes);
    uart_puts(" bytes (~"); speed_put_u64(tx_bps);
    uart_puts(" B/s)\n");
}

/* wolfIP callback shared by the listening socket and the single client.
 *
 * fd    - socket that raised the event
 * event - CB_EVENT_* bit mask
 * arg   - the struct wolfIP instance registered with the callback */
static void speed_cb(int fd, uint16_t event, void *arg)
{
    struct wolfIP *s = (struct wolfIP *)arg;
    int n;

    if (fd == speed_listen_fd) {
        if (event & CB_EVENT_READABLE) {
            int c = wolfIP_sock_accept(s, speed_listen_fd, NULL, NULL);
            if (c >= 0) {
                /* Single-client server: if a measurement is already
                 * running, reject the newcomer rather than overwrite
                 * speed_client_fd. Overwriting would orphan the active
                 * socket: its callback is still speed_cb, but the
                 * fd != speed_client_fd check below would ignore it, so it
                 * would never be closed. This holds independent of
                 * MAX_TCPSOCKETS. */
                if (speed_client_fd >= 0) {
                    (void)wolfIP_sock_close(s, c);
                } else {
                    speed_client_fd = c;
                    speed_rx_bytes = 0;
                    speed_tx_bytes = 0;
                    speed_start_ms = app_now_ms();
                    wolfIP_register_callback(s, c, speed_cb, s);
                    uart_puts("SPEED client connected\n");
                }
            }
        }
        return;
    }

    if (fd != speed_client_fd)
        return;
#ifdef SPEED_DEBUG
    if (event & CB_EVENT_READABLE) uart_putc('r');
    if (event & CB_EVENT_WRITABLE) uart_putc('w');
    if (event & CB_EVENT_CLOSED) uart_putc('C');
#endif

    /* RX: READABLE is edge-triggered, so reading a single chunk would
     * leave the rest buffered; once the advertised TCP window fills, the
     * peer stops sending, no new READABLE fires and the connection
     * deadlocks (observed as a stuck ~2 KB snd_wnd on the sender).
     * Draining forever is just as wrong: the callback would never return
     * to wolfIP_poll, so the window-update ACK would never go out. Hence a
     * bounded drain - up to SPEED_BURST_MAX chunks, stopping early when
     * recvfrom reports an empty buffer (<= 0). */
    if (event & CB_EVENT_READABLE) {
        int drains = 0;
        do {
            n = wolfIP_sock_recvfrom(s, fd, speed_buf, sizeof(speed_buf), 0,
                                     NULL, NULL);
            if (n > 0)
                speed_rx_bytes += (uint64_t)n;
        } while (n > 0 && ++drains < SPEED_BURST_MAX);
#ifdef SPEED_DEBUG
        if (drains >= SPEED_BURST_MAX)
            uart_putc('!'); /* hit the cap - more buffered than a window */
#endif
    }

    /* TX: bounded fill of the tx buffer on each WRITABLE, then yield to
     * the poll loop so it can flush to the wire. sock_send returns <= 0
     * once the tx buffer is full, so the loop normally stops well below
     * the cap; the cap is only a backstop against an unbounded spin. */
    if (event & CB_EVENT_WRITABLE) {
        int fills = 0;
        do {
            n = wolfIP_sock_send(s, fd, speed_buf, sizeof(speed_buf), 0);
            if (n > 0)
                speed_tx_bytes += (uint64_t)n;
        } while (n > 0 && ++fills < SPEED_BURST_MAX);
    }

    if (event & CB_EVENT_CLOSED) {
        speed_print_result();
        (void)wolfIP_sock_close(s, fd);
        speed_client_fd = -1;
    }
}
#endif /* SPEED_TEST */
*/ + if (event & CB_EVENT_WRITABLE) { + int fills = 0; + do { + n = wolfIP_sock_send(s, fd, speed_buf, sizeof(speed_buf), 0); + if (n > 0) + speed_tx_bytes += (uint64_t)n; + } while (n > 0 && ++fills < 32); + } + + if (event & CB_EVENT_CLOSED) { + speed_print_result(); + (void)wolfIP_sock_close(s, fd); + speed_client_fd = -1; + } +} +#endif /* SPEED_TEST */ + +int main(void) +{ + struct wolfIP_ll_dev *ll; + struct wolfIP_sockaddr_in addr; + int ret; + + uart_init(); + uart_puts(board_banner()); + + gic_init(); + + uart_puts("Initializing wolfIP stack...\n"); + wolfIP_init_static(&IPStack); + + uart_puts("Bringing up GEM (RGMII)...\n"); + ll = wolfIP_getdev(IPStack); + ret = amd_eth_init(ll); + if (ret < 0) { + uart_puts("ERROR: amd_eth_init failed: "); + uart_puthex((uint32_t)ret); + uart_puts("\n"); + while (1) + ; + } + uart_puts(" link "); uart_puts((ret & 0x100) ? "UP" : "DOWN"); + uart_puts(", PHY="); uart_puthex((uint32_t)(ret & 0xFF)); + uart_puts("\n"); + + /* Unmask CPU IRQs on boards that use IRQ-driven RX (no-op on the + * poll-only boards, where the GEM interrupt is left masked). */ + board_irq_setup(); + +#ifdef DEBUG_GEM + uart_puts("Initial GEM state:\n"); + gem_dump_state(); +#endif + +#ifdef DHCP + if (dhcp_client_init(IPStack) >= 0) { + uint64_t dhcp_start = app_now_ms(); + const uint64_t dhcp_timeout = 15000; +#ifdef DEBUG_GEM + uint64_t dbg_next = dhcp_start + 1000; +#endif + uart_puts("Starting DHCP client...\n"); + while (!dhcp_bound(IPStack) && dhcp_client_is_running(IPStack) + && (app_now_ms() - dhcp_start) < dhcp_timeout) { + (void)wolfIP_poll(IPStack, app_now_ms()); +#ifdef DEBUG_GEM + if (app_now_ms() >= dbg_next) { + dbg_next += 1000; + uart_puts(" ["); uart_putdec((uint32_t)(app_now_ms() - dhcp_start)); + uart_puts(" ms] bound="); + uart_putdec(dhcp_bound(IPStack) ? 1u : 0u); + uart_puts(" running="); + uart_putdec(dhcp_client_is_running(IPStack) ? 
1u : 0u); + uart_puts("\n"); + gem_dump_state(); + } +#endif + } + if (dhcp_bound(IPStack)) { + ip4 ip = 0, nm = 0, gw = 0; + wolfIP_ipconfig_get(IPStack, &ip, &nm, &gw); + uart_puts("DHCP bound:\n IP: "); uart_putip4(ip); + uart_puts("\n Mask: "); uart_putip4(nm); + uart_puts("\n GW: "); uart_putip4(gw); + uart_puts("\n"); + } else { + ip4 ip = atoip4(WOLFIP_IP); + ip4 nm = atoip4(WOLFIP_NETMASK); + ip4 gw = atoip4(WOLFIP_GW); + uart_puts("DHCP timeout - using static IP\n"); + wolfIP_ipconfig_set(IPStack, ip, nm, gw); + } + } +#else + { + ip4 ip = atoip4(WOLFIP_IP); + ip4 nm = atoip4(WOLFIP_NETMASK); + ip4 gw = atoip4(WOLFIP_GW); + wolfIP_ipconfig_set(IPStack, ip, nm, gw); + uart_puts("Static IP: "); uart_putip4(ip); uart_puts("\n"); + } +#endif + +#ifdef SPEED_TEST + uart_puts("Opening TCP throughput server on port "); + uart_putdec(SPEED_PORT); uart_puts("\n"); + speed_listen_fd = wolfIP_sock_socket(IPStack, AF_INET, + IPSTACK_SOCK_STREAM, 0); + if (speed_listen_fd < 0) { + uart_puts("ERROR: TCP socket alloc failed: "); + uart_puthex((uint32_t)speed_listen_fd); uart_puts("\n"); + while (1) + ; + } + wolfIP_register_callback(IPStack, speed_listen_fd, speed_cb, IPStack); + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = ee16(SPEED_PORT); + addr.sin_addr.s_addr = 0; + if (wolfIP_sock_bind(IPStack, speed_listen_fd, + (struct wolfIP_sockaddr *)&addr, sizeof(addr)) < 0) { + uart_puts("ERROR: TCP bind failed\n"); + while (1) + ; + } + if (wolfIP_sock_listen(IPStack, speed_listen_fd, 1) < 0) { + uart_puts("ERROR: TCP listen failed\n"); + while (1) + ; + } + + uart_puts("Ready. 
RX: dd if=/dev/zero bs=1460 count=N | nc "); + uart_putdec(SPEED_PORT); + uart_puts("\n TX: nc "); uart_putdec(SPEED_PORT); + uart_puts(" /dev/null\n\n"); +#else + uart_puts("Opening UDP echo socket on port "); + uart_putdec(ECHO_PORT); uart_puts("\n"); + udp_fd = wolfIP_sock_socket(IPStack, AF_INET, IPSTACK_SOCK_DGRAM, 0); + if (udp_fd < 0) { + uart_puts("ERROR: UDP socket alloc failed: "); + uart_puthex((uint32_t)udp_fd); uart_puts("\n"); + while (1) + ; + } + wolfIP_register_callback(IPStack, udp_fd, udp_echo_cb, IPStack); + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = ee16(ECHO_PORT); + addr.sin_addr.s_addr = 0; + if (wolfIP_sock_bind(IPStack, udp_fd, + (struct wolfIP_sockaddr *)&addr, sizeof(addr)) < 0) { + uart_puts("ERROR: UDP bind failed on port "); + uart_putdec(ECHO_PORT); uart_puts("\n"); + while (1) + ; + } + + uart_puts("Ready. Try: nc -u 7\n\n"); +#endif /* SPEED_TEST */ + + /* Busy-poll with a real-millisecond clock. The previous tick++ + + * delay_ms(1) throttled the stack to ~1 poll/ms (a hard throughput + * ceiling) and fed wolfIP a counter that only approximated real ms. */ +#ifdef SPEED_DEBUG + { + uint64_t last_hb = app_now_ms(); + for (;;) { + uint64_t now = app_now_ms(); + (void)wolfIP_poll(IPStack, now); + if (now - last_hb >= 1000) { + last_hb = now; + uart_putc('P'); + } + } + } +#else + for (;;) { + (void)wolfIP_poll(IPStack, app_now_ms()); + } +#endif + + return 0; +} diff --git a/src/port/amd/common/app.h b/src/port/amd/common/app.h new file mode 100644 index 00000000..964d2e38 --- /dev/null +++ b/src/port/amd/common/app.h @@ -0,0 +1,20 @@ +/* app.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Board hooks consumed by the shared UDP-echo + DHCP demo (app.c). + * Implemented per board in boards//board.c. + */ +#ifndef AMD_APP_H +#define AMD_APP_H + +/* Multi-line intro banner (board name + GIC type) printed at startup. 
*/ +const char *board_banner(void); + +/* Called after the GEM is up: unmask CPU IRQs on boards that use + * IRQ-driven RX; a no-op on poll-only boards. */ +void board_irq_setup(void); + +#endif /* AMD_APP_H */ diff --git a/src/port/amd/common/entropy.c b/src/port/amd/common/entropy.c new file mode 100644 index 00000000..f646c484 --- /dev/null +++ b/src/port/amd/common/entropy.c @@ -0,0 +1,127 @@ +/* entropy.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * MemUse-pattern entropy source for the wolfIP AMD/Xilinx ports. + * + * These PS SoCs do not ship a hardware TRNG usable from bare-metal + * without the platform firmware. This source produces non-deterministic + * 32-bit words from the timing variance of a data-dependent memory-access + * loop over a small state buffer, sampled with a free-running counter + * (arch_counter64(), arch-specific: AArch64 CNTVCT_EL0 / ARMv7 MPCore + * global timer) before and after the walk. It is the same primitive + * wolfCrypt's wc_Entropy_Get() (HAVE_ENTROPY_MEMUSE) uses internally. 
+ * + * This implementation skips wolfCrypt's SHA3-256 conditioning because the + * consumers in wolfIP (TCP ISN, DHCP/DNS transaction IDs, ephemeral source + * ports, IP fragment ID) need unpredictable bits, not crypto-grade + * randomness. For crypto-grade seeding the port should be rebuilt with the + * full wolfCrypt wc_Entropy_Get() in place of amd_get_random32(). + * + * NOTE: the 1 KB state buffer fits within L1 (it is NOT "larger than the + * cache"), so the dominant entropy is the counter granularity and the + * data-dependent walk timing rather than guaranteed cache misses. The + * output is intentionally non-crypto-grade -- see above. + * + * Algorithm per call: + * 1. t0 = arch_counter64() + * 2. Walk state[] performing read+xor+write (data-dependent stride). + * 3. t1 = arch_counter64() + * 4. Fold (t1 - t0) into the rolling 64-bit accumulator and perturb + * state[] so the next call diverges. + * 5. Apply a non-cryptographic finaliser (xorshift) and return the low + * 32 bits. + */ +#include +#include "timer.h" /* arch_counter64() */ + +#define ENTROPY_STATE_WORDS 128u /* 1024 bytes, 16 cache lines */ +#define ENTROPY_WALK_ITERS 256u + +static volatile uint64_t entropy_state[ENTROPY_STATE_WORDS]; +static volatile uint64_t entropy_acc; +static volatile uint32_t entropy_idx; +static volatile int entropy_seeded; + +/* Return a 32-bit value with low predictability, suitable for + * protocol identifiers (DHCP xid, DNS id, TCP ISN, ephemeral port, + * IP fragment id). Not crypto-grade; see file header. */ +uint32_t amd_get_random32(void) +{ + uint64_t t0, t1, delta; + uint64_t acc; + uint32_t i; + uint32_t walk_idx; + uint64_t seed; + uint32_t k; + + /* One-time seed so the earliest outputs (TCP ISN, DHCP xid, ephemeral + * port, ...) do not derive from a single timing delta over all-zero + * state. Fold the hardware counter, this frame's stack address, and a + * per-word counter re-read (each carries a little timing variance) into + * the rolling state. 
Still non-crypto-grade - see the file header. */ + if (!entropy_seeded) { + seed = arch_counter64() ^ (uint64_t)(uintptr_t)&seed; + for (k = 0; k < ENTROPY_STATE_WORDS; k++) { + seed ^= seed << 13; + seed ^= seed >> 7; + seed ^= seed << 17; + entropy_state[k] ^= seed ^ arch_counter64(); + } + entropy_acc ^= seed; + entropy_seeded = 1; + } + + t0 = arch_counter64(); + + /* Memory-access loop: stride through the state array. Using a + * data-dependent index (acc & mask) keeps the prefetcher from + * predicting cache lines, which is exactly the timing noise we + * want to harvest. */ + acc = entropy_acc; + walk_idx = entropy_idx; + for (i = 0; i < ENTROPY_WALK_ITERS; i++) { + uint32_t pos = (walk_idx + (uint32_t)(acc & 0x7Fu)) + & (ENTROPY_STATE_WORDS - 1u); + uint64_t v = entropy_state[pos]; + v ^= acc; + v = (v << 1) | (v >> 63); /* rotate left 1 */ + entropy_state[pos] = v; + acc += v; + walk_idx++; + } + + t1 = arch_counter64(); + delta = t1 - t0; + + /* Fold the timing delta into the accumulator and the head of + * the state ring. */ + acc ^= delta; + acc ^= (delta << 17) | (delta >> 47); + entropy_state[walk_idx & (ENTROPY_STATE_WORDS - 1u)] ^= acc; + entropy_acc = acc; + entropy_idx = walk_idx; + + /* xorshift64 finaliser to whiten the output word. */ + acc ^= acc << 13; + acc ^= acc >> 7; + acc ^= acc << 17; + + return (uint32_t)acc; +} diff --git a/src/port/amd/common/gem.h b/src/port/amd/common/gem.h new file mode 100644 index 00000000..56b72c35 --- /dev/null +++ b/src/port/amd/common/gem.h @@ -0,0 +1,36 @@ +/* gem.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Public API of the shared Cadence GEM driver for the AMD/Xilinx ports + * (ZynqMP / Versal / Zynq-7000). Single-instance, RGMII, gigabit, polled + * TX. The RX delivery model (IRQ-driven swq vs poll-only) is selected per + * board by the gem_rx_*.c translation unit linked in. 
+ */ +#ifndef AMD_GEM_H +#define AMD_GEM_H + +#include +#include "wolfip.h" + +/* Initialize the GEM, its clock + reset, the PHY, and populate the wolfIP + * link-layer device. Returns: + * < 0 on error (negated TRM code) + * bits [7:0] PHY MDIO address used + * bit [8] link_up flag (1 = link is up at end of init) + */ +int amd_eth_init(struct wolfIP_ll_dev *ll); + +/* MDIO helpers exposed for the PHY drivers. */ +int gem_mdio_read(uint8_t phy_addr, uint8_t reg, uint16_t *out); +int gem_mdio_write(uint8_t phy_addr, uint8_t reg, uint16_t value); + +/* Diagnostics: dump GEM registers and counters to UART. */ +void gem_dump_state(void); +uint32_t gem_irq_count(void); +uint32_t gem_rx_frames(void); +uint32_t gem_tx_sent(void); + +#endif /* AMD_GEM_H */ diff --git a/src/port/amd/common/gem_core.c b/src/port/amd/common/gem_core.c new file mode 100644 index 00000000..0db18253 --- /dev/null +++ b/src/port/amd/common/gem_core.c @@ -0,0 +1,436 @@ +/* gem_core.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Shared Cadence GEM core for the AMD/Xilinx ports. Owns the BD rings, + * MDIO, polled TX (eth_send), the init sequence (amd_eth_init) and the + * diagnostics. 
Everything that diverges per arch/SoC/board is reached + * through hooks declared in gem_port.h: cache maintenance (arch cache.h), + * clock/reset (board_gem.c), PHY dispatch (phy_dispatch_*.c) and the RX + * delivery model (gem_rx_*.c). + */ +#include +#include +#include "config.h" +#include "wolfip.h" +#include "board.h" +#include "uart.h" +#include "gem.h" +#include "gem_regs.h" +#include "gem_port.h" +#include "cache.h" + +/* --------------------------------------------------------------------- + * DMA-visible objects (BD rings, frame buffers, dummy BDs). The linker + * places .dma_buffers in OCM (OCM layout) or a Normal-NC carve-out (DDR + * layout); either way the MMU maps it Normal Non-Cacheable, so the rings + * are inherently DMA-coherent. The cache_clean/cache_inval calls at each + * BD hand-off are then no-ops, but stay correct should a future layout + * map the rings cacheable. + * ------------------------------------------------------------------- */ +struct gem_bd gem_rx_ring[RX_RING_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +struct gem_bd gem_tx_ring[TX_RING_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +uint8_t gem_rx_buf_pool[RX_RING_LEN][BUF_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +uint8_t gem_tx_buf_pool[TX_RING_LEN][BUF_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); + +/* Dummy BD pair for disabling priority queues 1-3. The TX dummy has + * USED=1 so the MAC ignores it; the RX dummy has the SW-OWN bit set so + * the MAC won't write into queue 1-3 RX. 
*/ +static struct gem_bd dummy_tx_bd + __attribute__((aligned(8), section(".dma_buffers"))); +static struct gem_bd dummy_rx_bd + __attribute__((aligned(8), section(".dma_buffers"))); + +volatile uint32_t gem_drops; +volatile uint32_t gem_irqs; +volatile uint32_t gem_rxframes; +volatile uint32_t gem_txsent; + +uint32_t gem_rx_next; /* next BD the SW will look at */ +uint32_t gem_tx_next; /* next BD the SW will try to TX */ + +uint8_t gem_phy_addr; + +/* --------------------------------------------------------------------- + * MDIO + * ------------------------------------------------------------------- */ +static int mdio_wait_idle(void) +{ + int spin; + for (spin = 0; spin < 100000; spin++) { + if (GEM_NWSR & NWSR_PHY_IDLE) + return 0; + } + return -1; +} + +int gem_mdio_read(uint8_t phy_addr, uint8_t reg, uint16_t *out) +{ + uint32_t v; + if (mdio_wait_idle() < 0) + return -1; + v = PHYMNTNC_CLAUSE22 | PHYMNTNC_OP_R + | (((uint32_t)phy_addr & 0x1Fu) << 23) + | (((uint32_t)reg & 0x1Fu) << 18); + GEM_PHYMNTNC = v; + if (mdio_wait_idle() < 0) + return -2; + *out = (uint16_t)(GEM_PHYMNTNC & 0xFFFFu); + return 0; +} + +int gem_mdio_write(uint8_t phy_addr, uint8_t reg, uint16_t value) +{ + uint32_t v; + if (mdio_wait_idle() < 0) + return -1; + v = PHYMNTNC_CLAUSE22 | PHYMNTNC_OP_W + | (((uint32_t)phy_addr & 0x1Fu) << 23) + | (((uint32_t)reg & 0x1Fu) << 18) + | (uint32_t)value; + GEM_PHYMNTNC = v; + if (mdio_wait_idle() < 0) + return -2; + return 0; +} + +/* --------------------------------------------------------------------- + * BD ring init + * ------------------------------------------------------------------- */ +static void rx_ring_init(void) +{ + uint32_t i; + for (i = 0; i < RX_RING_LEN; i++) { + uint32_t addr = (uint32_t)(uintptr_t)gem_rx_buf_pool[i]; + addr &= RXBUF_ADDR_MASK; + if (i == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + gem_rx_ring[i].addr = addr; /* OWN=0 -> hardware can use */ + gem_rx_ring[i].status = 0; + } + gem_rx_next = 0; + /* Clean the ring to 
/* Initialise the TX descriptor ring with every entry idle.
 *
 * Matches u-boot zynq_gem: all BDs start as dummies with USED|LAST and
 * addr=0, and the last BD additionally carries WRAP so the MAC walker
 * stays inside this ring. eth_send fills addr + length + LAST (and clears
 * USED) when actually transmitting. */
static void tx_ring_init(void)
{
    uint32_t i;
    for (i = 0; i < TX_RING_LEN; i++) {
        gem_tx_ring[i].addr = 0;
        gem_tx_ring[i].status = TXBUF_USED | TXBUF_LAST
                              | ((i == TX_RING_LEN - 1) ? TXBUF_WRAP : 0);
    }
    gem_tx_next = 0;
    /* Make the descriptors visible to the MAC before it is enabled. */
    cache_clean(gem_tx_ring, sizeof(gem_tx_ring));
}

/* ---------------------------------------------------------------------
 * eth_send (TX path; shared by all RX models)
 * ------------------------------------------------------------------- */

/* wolfIP ll->send hook: copy one frame into the next TX buffer and start
 * transmission.
 *
 * ll  - unused (single GEM instance)
 * buf - frame to send
 * len - frame length in bytes
 *
 * Returns the number of bytes queued - 60 for any frame shorter than 60,
 * because short frames are zero-padded - or -WOLFIP_EINVAL when len
 * exceeds BUF_LEN, or -WOLFIP_EAGAIN when the descriptor is still owned by
 * the MAC after the bounded wait. */
static int eth_send(struct wolfIP_ll_dev *ll, void *buf, uint32_t len)
{
    uint32_t idx;
    uint32_t status;

    (void)ll;

    if (len > BUF_LEN)
        return -WOLFIP_EINVAL; /* frame larger than a BD buffer */

    idx = gem_tx_next;
    /* Wait briefly for the BD to be free (USED=1 means MAC done). The
     * USED bit is written back by MAC DMA - invalidate the cache line so
     * the CPU does not see the stale USED=0 we wrote when we last armed
     * this BD. */
    {
        int spin;
        for (spin = 0; spin < 100000; spin++) {
            cache_inval(&gem_tx_ring[idx], sizeof(gem_tx_ring[idx]));
            if (gem_tx_ring[idx].status & TXBUF_USED)
                break;
        }
        /* Re-test after the loop: it also ends when the spin budget runs
         * out with the BD still busy. */
        if ((gem_tx_ring[idx].status & TXBUF_USED) == 0)
            return -WOLFIP_EAGAIN; /* TX ring backed up - core retries */
    }

    memcpy(gem_tx_buf_pool[idx], buf, len);

    /* Pad to minimum Ethernet frame (60 bytes; MAC adds 4-byte FCS). */
    if (len < 60u) {
        memset(gem_tx_buf_pool[idx] + len, 0, 60u - len);
        len = 60u;
    }

    /* Flush the frame buffer from D-cache so MAC DMA reads see it. */
    cache_clean(gem_tx_buf_pool[idx], len);

    /* Re-arm BD: set buffer address, then clear USED with length+LAST
     * (preserve WRAP if this is the last BD). Buffer addr written before
     * status so MAC walking the ring sees a valid pair.
     * NOTE(review): nothing in this function orders the two descriptor
     * stores, or the descriptor stores against the STARTTX register write
     * below, other than whatever cache_clean() does internally - confirm
     * that it issues a barrier even when the ring is mapped
     * non-cacheable. */
    gem_tx_ring[idx].addr = (uint32_t)(uintptr_t)gem_tx_buf_pool[idx];
    status = (len & TXBUF_LEN_MASK) | TXBUF_LAST;
    if (idx == TX_RING_LEN - 1)
        status |= TXBUF_WRAP;
    gem_tx_ring[idx].status = status; /* USED=0 -> ready for MAC */

    cache_clean(&gem_tx_ring[idx], sizeof(gem_tx_ring[idx]));
    GEM_NWCTRL |= NWCTRL_STARTTX; /* kick the transmitter */

    gem_txsent++;
    gem_tx_next = (idx + 1) % TX_RING_LEN; /* advance, wrapping at ring end */
    return (int)len;
}
*/ + cache_clean(gem_tx_buf_pool[idx], len); + + /* Re-arm BD: set buffer address, then clear USED with length+LAST + * (preserve WRAP if this is the last BD). Buffer addr written before + * status so MAC walking the ring sees a valid pair. */ + gem_tx_ring[idx].addr = (uint32_t)(uintptr_t)gem_tx_buf_pool[idx]; + status = (len & TXBUF_LEN_MASK) | TXBUF_LAST; + if (idx == TX_RING_LEN - 1) + status |= TXBUF_WRAP; + gem_tx_ring[idx].status = status; /* USED=0 -> ready for MAC */ + + cache_clean(&gem_tx_ring[idx], sizeof(gem_tx_ring[idx])); + GEM_NWCTRL |= NWCTRL_STARTTX; + + gem_txsent++; + gem_tx_next = (idx + 1) % TX_RING_LEN; + return (int)len; +} + +/* --------------------------------------------------------------------- + * Diagnostics + * ------------------------------------------------------------------- */ +uint32_t gem_irq_count(void) { return gem_irqs; } +uint32_t gem_rx_frames(void) { return gem_rxframes; } +uint32_t gem_tx_sent(void) { return gem_txsent; } + +void gem_dump_state(void) +{ + uint32_t i; + uint32_t filled; + uint32_t first_filled; + cache_inval(gem_rx_ring, sizeof(gem_rx_ring)); + cache_inval(gem_tx_ring, sizeof(gem_tx_ring)); + uart_puts("GEM regs: NWCTRL="); uart_puthex(GEM_NWCTRL); + uart_puts(" NWCFG="); uart_puthex(GEM_NWCFG); + uart_puts(" NWSR="); uart_puthex(GEM_NWSR); + uart_puts(" DMACR="); uart_puthex(GEM_DMACR); + uart_puts("\n ISR="); uart_puthex(GEM_ISR); + uart_puts(" RSR="); uart_puthex(GEM_RSR); + uart_puts(" TSR="); uart_puthex(GEM_TSR); + uart_puts(" IMR="); uart_puthex(GEM_IMR); + uart_puts("\n tx[0]="); uart_puthex(gem_tx_ring[0].addr); + uart_puts("/"); uart_puthex(gem_tx_ring[0].status); + uart_puts(" rx[0]="); uart_puthex(gem_rx_ring[0].addr); + uart_puts("/"); uart_puthex(gem_rx_ring[0].status); + uart_puts("\n irq="); uart_putdec(gem_irqs); + uart_puts(" rx_frm="); uart_putdec(gem_rxframes); + uart_puts(" tx_snt="); uart_putdec(gem_txsent); + uart_puts(" drops="); uart_putdec(gem_drops); + uart_puts("\n HW counters: 
txoct="); uart_putdec(GEM_OCTTXL); + uart_puts(" txcnt="); uart_putdec(GEM_TXCNT); + uart_puts(" rxoct="); uart_putdec(GEM_OCTRXL); + uart_puts(" rxcnt="); uart_putdec(GEM_RXCNT); + uart_puts(" rxfcs="); uart_putdec(GEM_RXFCSCNT); + uart_puts(" rxor="); uart_putdec(GEM_RXORCNT); + filled = 0; + first_filled = 0xFFFF; + for (i = 0; i < RX_RING_LEN; i++) { + if (gem_rx_ring[i].addr & RXBUF_OWN_SW) { + filled++; + if (first_filled == 0xFFFF) first_filled = i; + } + } + uart_puts(" rx_filled="); uart_putdec(filled); + uart_puts(" first="); uart_putdec(first_filled); + uart_puts(" gem_rx_next="); uart_putdec(gem_rx_next); + uart_puts("\n"); +} + +/* --------------------------------------------------------------------- + * Public init + * ------------------------------------------------------------------- */ +int amd_eth_init(struct wolfIP_ll_dev *ll) +{ + uint8_t addr; + uint16_t id1; + int found_phy; + int speed; + int fd; + int link_up; + uint32_t k; + + /* SoC-specific prerequisites (e.g. Zynq-7000 PL310 L2 disable). */ + gem_soc_pre_init(); + + /* Bring the MAC to a known state on top of the platform clock. */ + gem_clk_reset(); + + /* Disable everything before configuring. */ + GEM_NWCTRL = 0; + GEM_IDR = 0xFFFFFFFFu; + (void)GEM_ISR; + GEM_ISR = 0xFFFFFFFFu; + GEM_TSR = 0xFFFFFFFFu; + GEM_RSR = RSR_BUFFNA | RSR_FRAMERX | RSR_RXOVR | RSR_HRESPNOK; + + /* Initial NWCFG: gigabit, full duplex, MDC=/96, 1536-byte frames, + * strip FCS from RX, accept broadcasts, multicast via hash. Address + * filtering is left on (no promiscuous COPYALL): broadcast (DHCP/ARP) + * and our unicast / multicast-hash frames are received, which is all + * the stack needs and avoids DMA'ing every frame on a busy LAN. */ + GEM_NWCFG = NWCFG_1000 + | NWCFG_FDEN + | NWCFG_FCSREM + | NWCFG_1536RXEN + | NWCFG_MCASTHASHEN + | (5u << NWCFG_MDCDIV_SHIFT); +#ifdef XILINX_AARCH64 + /* 64-bit AMBA data width: appropriate on the AArch64 SoCs (ZynqMP / + * Versal). 
The Zynq-7000 GEM is fed by a 32-bit AXI master, where this + * bit is inert, so it is left clear there. */ + GEM_NWCFG |= NWCFG_DWIDTH_64; +#endif + + /* DMACR: AHB fixed burst 16 beats, RX buffer 1536/64=24, TX/RX packet + * buffer memory at max. Do NOT set bit 30 (DMA_ADDR_BUS_WIDTH 64-bit): + * that selects 16-byte BD format with addr_hi and would break the + * 8-byte struct gem_bd layout. */ + GEM_DMACR = (24u << 16) /* RX buffer size in 64-byte units */ + | (1u << 10) /* TX packet buffer memory size = max */ + | (3u << 8) /* RX packet buffer memory size = max */ + | 0x10u; /* burst length = 16 */ + + /* Set MAC address into SAB1/SAT1. SAB1L latches on SAB1H write per + * TRM, so write the high half last. */ + GEM_LADDR1L = (uint32_t)WOLFIP_MAC_0 + | ((uint32_t)WOLFIP_MAC_1 << 8) + | ((uint32_t)WOLFIP_MAC_2 << 16) + | ((uint32_t)WOLFIP_MAC_3 << 24); + GEM_LADDR1H = (uint32_t)WOLFIP_MAC_4 + | ((uint32_t)WOLFIP_MAC_5 << 8); + + GEM_HASHL = 0; + GEM_HASHH = 0; + + /* Build BD rings. */ + rx_ring_init(); + tx_ring_init(); + GEM_RXQBASE = (uint32_t)(uintptr_t)gem_rx_ring; + GEM_TXQBASE = (uint32_t)(uintptr_t)gem_tx_ring; + + /* Disable priority queues 1-3 with dummy BDs (else the MAC may walk + * uninitialised q1/q2/q3 base pointers and hang). */ + dummy_tx_bd.addr = 0; + dummy_tx_bd.status = TXBUF_USED | TXBUF_WRAP | TXBUF_LAST; + dummy_rx_bd.addr = RXBUF_WRAP | RXBUF_OWN_SW; + dummy_rx_bd.status = 0; + GEM_TXQ1BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_TXQ2BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_TXQ3BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_RXQ1BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + GEM_RXQ2BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + GEM_RXQ3BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + cache_clean(&dummy_tx_bd, sizeof(dummy_tx_bd)); + cache_clean(&dummy_rx_bd, sizeof(dummy_rx_bd)); + + /* Clear any stale RX/TX packet classification screening (default 0 = + * everything to Q0). 
*/ + for (k = 0; k < 16; k++) { + GEM_SCREEN_T1(k) = 0; + GEM_SCREEN_T2(k) = 0; + } + + /* Enable MDIO so we can talk to the PHY. */ + GEM_NWCTRL |= NWCTRL_MDEN; + + /* Scan all 32 MDIO addresses, reporting each responsive PHY's ID and + * link status (BMSR reg 1, bit 2). A board may present more than one + * PHY on the bus; prefer one that already has copper link so we + * configure the PHY wired to the on-board RJ45 rather than the first + * responder. */ + found_phy = 0; + gem_phy_addr = 0; + { + uint16_t bmsr; + for (addr = 0; addr < 32; addr++) { + if (gem_mdio_read(addr, 0x02, &id1) != 0 || id1 == 0xFFFFu || id1 == 0) + continue; + bmsr = 0; + (void)gem_mdio_read(addr, 0x01, &bmsr); + uart_puts("MDIO scan: addr="); uart_puthex(addr); + uart_puts(" id1="); uart_puthex(id1); + uart_puts(" bmsr="); uart_puthex(bmsr); + uart_puts((bmsr & 0x0004u) ? " LINK\n" : "\n"); + if (!found_phy || (bmsr & 0x0004u)) { + found_phy = 1; + gem_phy_addr = addr; + if (bmsr & 0x0004u) + break; /* linked PHY wins */ + } + } + if (!found_phy) { + uart_puts("GEM: no PHY responding on MDIO!\n"); + return -10; + } + } + /* Re-read id1 for the selected PHY so the vendor dispatch is correct + * even when the scan broke early on a linked PHY. */ + (void)gem_mdio_read(gem_phy_addr, 0x02, &id1); + uart_puts("GEM: PHY at MDIO addr="); + uart_puthex(gem_phy_addr); + uart_puts("\n"); + + if (gem_phy_init(gem_phy_addr, id1, &speed, &fd) < 0) { + uart_puts("GEM: PHY init failed\n"); + return -11; + } + + /* If PHY ended up at 10/100, downshift the MAC and re-program the + * RGMII reference clock to match (no-op on SoCs where the platform + * firmware owns the GEM clock). 
*/ + if (speed != 1000) { + uint32_t cfg = GEM_NWCFG; + cfg &= ~NWCFG_1000; + if (speed == 100) + cfg |= NWCFG_SPEED100; + else + cfg &= ~NWCFG_SPEED100; + if (!fd) + cfg &= ~NWCFG_FDEN; + GEM_NWCFG = cfg; + gem_set_ref_clk(speed); + } + + /* Arm the RX delivery model (install IRQ handler, or leave masked for + * poll-only ports) and enable RX/TX. */ + gem_rx_install(); + GEM_NWCTRL |= NWCTRL_RXEN | NWCTRL_TXEN; + + /* Populate wolfIP ll_dev. */ + ll->mac[0] = WOLFIP_MAC_0; + ll->mac[1] = WOLFIP_MAC_1; + ll->mac[2] = WOLFIP_MAC_2; + ll->mac[3] = WOLFIP_MAC_3; + ll->mac[4] = WOLFIP_MAC_4; + ll->mac[5] = WOLFIP_MAC_5; + memcpy(ll->ifname, "eth0", 5); + ll->non_ethernet = 0; + ll->mtu = LINK_MTU; + ll->poll = gem_eth_poll; + ll->send = eth_send; + ll->priv = NULL; + + link_up = (gem_phy_link_status(gem_phy_addr) == 1) ? 1 : 0; + return (link_up << 8) | (int)gem_phy_addr; +} diff --git a/src/port/amd/common/gem_port.h b/src/port/amd/common/gem_port.h new file mode 100644 index 00000000..a4314350 --- /dev/null +++ b/src/port/amd/common/gem_port.h @@ -0,0 +1,54 @@ +/* gem_port.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Internal interface between the shared GEM core (gem_core.c) and the + * per-port / per-arch pieces it is composed with at build time: + * - cache maintenance arch//cache.h (cache_clean/cache_inval) + * - SoC clock/reset quirks boards//board_gem.c + * - PHY vendor dispatch ip/phy_dispatch_*.c + * - RX delivery model ip/gem_rx_*.c (+ ip/gem_swq.c for the swq + * variants) + * Not part of the public API (that is gem.h). 
+ */ +#ifndef AMD_GEM_PORT_H +#define AMD_GEM_PORT_H + +#include +#include "wolfip.h" +#include "gem_regs.h" + +/* --- State owned by gem_core.c, shared with the RX-model TU --- */ +extern struct gem_bd gem_rx_ring[RX_RING_LEN]; +extern struct gem_bd gem_tx_ring[TX_RING_LEN]; +extern uint8_t gem_rx_buf_pool[RX_RING_LEN][BUF_LEN]; +extern uint8_t gem_tx_buf_pool[TX_RING_LEN][BUF_LEN]; +extern uint32_t gem_rx_next; +extern uint32_t gem_tx_next; +extern volatile uint32_t gem_irqs; +extern volatile uint32_t gem_rxframes; +extern volatile uint32_t gem_txsent; +extern volatile uint32_t gem_drops; +extern uint8_t gem_phy_addr; + +/* --- SoC clock/reset hooks (boards//board_gem.c) --- */ +void gem_soc_pre_init(void); /* SoC quirks before MAC config */ +void gem_clk_reset(void); /* pulse MAC reset / base clock */ +void gem_set_ref_clk(int speed_mbps); /* RGMII TX clock for link speed */ + +/* --- PHY vendor dispatch (ip/phy_dispatch_*.c). id1 = MII reg 2 from + * the MDIO scan; sets *speed (10/100/1000) and *fd; returns 0 / <0. --- */ +int gem_phy_init(uint8_t phy_addr, uint16_t id1, int *speed, int *full_duplex); +int gem_phy_link_status(uint8_t phy_addr); + +/* --- RX delivery model (ip/gem_rx_*.c) --- */ +void gem_rx_install(void); /* arm the RX path (install IRQ, or mask) */ +int gem_eth_poll(struct wolfIP_ll_dev *ll, void *buf, uint32_t len); /* ll->poll */ + +/* --- SW RX queue helpers (ip/gem_swq.c; swq RX models only) --- */ +void gem_isr(void); /* fill swq from gem_rx_ring */ +int gem_swq_drain(void *buf, uint32_t len); /* consume one swq slot */ + +#endif /* AMD_GEM_PORT_H */ diff --git a/src/port/amd/common/gem_regs.h b/src/port/amd/common/gem_regs.h new file mode 100644 index 00000000..a7e71d88 --- /dev/null +++ b/src/port/amd/common/gem_regs.h @@ -0,0 +1,124 @@ +/* gem_regs.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
+ * + * Cadence GEM register map, bit masks, buffer-descriptor layout and ring + * sizing, shared by the AMD/Xilinx GEM core and its per-port hooks. The + * register block base is GEM_BASE (each board.h selects the on-board GEM + * instance, e.g. ZynqMP GEM3, Versal/Zynq-7000 GEM0). + */ +#ifndef AMD_GEM_REGS_H +#define AMD_GEM_REGS_H + +#include +#include "board.h" /* GEM_BASE */ + +/* --------------------------------------------------------------------- + * Register accessors (subset we use) + * ------------------------------------------------------------------- */ +#define GEM_NWCTRL (*(volatile uint32_t *)(GEM_BASE + 0x000)) +#define GEM_NWCFG (*(volatile uint32_t *)(GEM_BASE + 0x004)) +#define GEM_NWSR (*(volatile uint32_t *)(GEM_BASE + 0x008)) +#define GEM_DMACR (*(volatile uint32_t *)(GEM_BASE + 0x010)) +#define GEM_TSR (*(volatile uint32_t *)(GEM_BASE + 0x014)) +#define GEM_RXQBASE (*(volatile uint32_t *)(GEM_BASE + 0x018)) +#define GEM_TXQBASE (*(volatile uint32_t *)(GEM_BASE + 0x01C)) +#define GEM_RSR (*(volatile uint32_t *)(GEM_BASE + 0x020)) +#define GEM_ISR (*(volatile uint32_t *)(GEM_BASE + 0x024)) +#define GEM_IER (*(volatile uint32_t *)(GEM_BASE + 0x028)) +#define GEM_IDR (*(volatile uint32_t *)(GEM_BASE + 0x02C)) +#define GEM_IMR (*(volatile uint32_t *)(GEM_BASE + 0x030)) +#define GEM_PHYMNTNC (*(volatile uint32_t *)(GEM_BASE + 0x034)) +#define GEM_HASHL (*(volatile uint32_t *)(GEM_BASE + 0x080)) +#define GEM_HASHH (*(volatile uint32_t *)(GEM_BASE + 0x084)) +#define GEM_LADDR1L (*(volatile uint32_t *)(GEM_BASE + 0x088)) +#define GEM_LADDR1H (*(volatile uint32_t *)(GEM_BASE + 0x08C)) +/* Priority queue base addresses (queues 1-3). Cadence GEM has 4 TX and + * 4 RX priority queues; if we don't point unused ones at a safe dummy + * BD, the MAC will eventually try to fetch from queue1+ at power-on- + * random addresses and hang (TSR.TXGO sticks with no octets sent). + * U-Boot's zynq_gem and Linux's macb both set these. 
*/ +#define GEM_TXQ1BASE (*(volatile uint32_t *)(GEM_BASE + 0x440)) +#define GEM_TXQ2BASE (*(volatile uint32_t *)(GEM_BASE + 0x444)) +#define GEM_TXQ3BASE (*(volatile uint32_t *)(GEM_BASE + 0x448)) +#define GEM_RXQ1BASE (*(volatile uint32_t *)(GEM_BASE + 0x480)) +#define GEM_RXQ2BASE (*(volatile uint32_t *)(GEM_BASE + 0x484)) +#define GEM_RXQ3BASE (*(volatile uint32_t *)(GEM_BASE + 0x488)) +#define GEM_OCTTXL (*(volatile uint32_t *)(GEM_BASE + 0x100)) +#define GEM_TXCNT (*(volatile uint32_t *)(GEM_BASE + 0x108)) +#define GEM_OCTRXL (*(volatile uint32_t *)(GEM_BASE + 0x150)) +#define GEM_RXCNT (*(volatile uint32_t *)(GEM_BASE + 0x158)) +#define GEM_RXFCSCNT (*(volatile uint32_t *)(GEM_BASE + 0x190)) +#define GEM_RXORCNT (*(volatile uint32_t *)(GEM_BASE + 0x1A4)) +/* Packet-classification screening registers (cleared at init). */ +#define GEM_SCREEN_T1(k) (*(volatile uint32_t *)(GEM_BASE + 0x500 + 4*(k))) +#define GEM_SCREEN_T2(k) (*(volatile uint32_t *)(GEM_BASE + 0x540 + 4*(k))) + +#define NWCTRL_LOOPEN (1u << 1) +#define NWCTRL_RXEN (1u << 2) +#define NWCTRL_TXEN (1u << 3) +#define NWCTRL_MDEN (1u << 4) +#define NWCTRL_STATCLR (1u << 5) +#define NWCTRL_STARTTX (1u << 9) +#define NWCTRL_HALTTX (1u << 10) + +#define NWCFG_SPEED100 (1u << 0) +#define NWCFG_FDEN (1u << 1) +#define NWCFG_COPYALL (1u << 4) +#define NWCFG_BCASTDI (1u << 5) +#define NWCFG_MCASTHASHEN (1u << 6) +#define NWCFG_UCASTHASHEN (1u << 7) +#define NWCFG_1536RXEN (1u << 8) +#define NWCFG_1000 (1u << 10) +#define NWCFG_FCSREM (1u << 17) +#define NWCFG_MDCDIV_SHIFT 18u +#define NWCFG_MDCDIV_MASK (7u << 18) +#define NWCFG_DWIDTH_64 (1u << 21) /* Data bus width = 64 bit (AArch64) */ + +#define NWSR_PHY_IDLE (1u << 2) + +#define RSR_BUFFNA (1u << 0) +#define RSR_FRAMERX (1u << 1) +#define RSR_RXOVR (1u << 2) +#define RSR_HRESPNOK (1u << 3) + +#define IXR_MGMNT (1u << 0) +#define IXR_FRAMERX (1u << 1) +#define IXR_TXCOMPL (1u << 7) +#define IXR_TXEXH (1u << 6) +#define IXR_RXUSED (1u << 2) +#define 
/* GEM BD: two 32-bit words.
 *
 * RX descriptors (see the RXBUF_* masks above):
 *   addr   - buffer address in bits 31:2 (RXBUF_ADDR_MASK), the ring-wrap
 *            marker in bit 1 (RXBUF_WRAP) and the ownership flag in bit 0
 *            (RXBUF_OWN_SW; the RX code treats a set bit as "frame ready
 *            for software" and writes it back as 0 to return the BD).
 *   status - received frame length in the low 13 bits (RXBUF_LEN_MASK).
 *
 * TX descriptors (see the TXBUF_* masks above):
 *   status - TXBUF_USED (bit 31), TXBUF_WRAP (bit 30), TXBUF_LAST (bit 15)
 *            and the frame length in the low 14 bits (TXBUF_LEN_MASK).
 */
struct gem_bd {
    uint32_t addr;      /* word 0: buffer address (+ RX wrap/ownership bits) */
    uint32_t status;    /* word 1: length and control/status flags */
};
*/ +uint32_t gic_total_irqs(void); +uint32_t gic_last_intid(void); + +/* Polled-mode IRQ dispatch: drains any pending IRQ from the GIC + * by reading GICC_IAR, calling the registered handler, and EOI'ing. + * Returns the number of interrupts dispatched in this call. + * + * Workaround: on some of these Cortex-A / GIC combinations the GIC + * latches pending interrupts correctly but the CPU never takes the + * IRQ exception (root cause not pinned). Calling this function from + * the main loop is functionally equivalent. */ +uint32_t gic_poll_dispatch(void); + +/* Provided by startup.S, asm helpers. */ +void irq_enable(void); +void irq_disable(void); + +/* Called by the IRQ vector trampoline in startup.S. Acknowledges, + * dispatches, and EOIs the current interrupt. */ +void irq_dispatch(void); + +#endif /* AMD_GIC_H */ diff --git a/src/port/amd/common/mmu.h b/src/port/amd/common/mmu.h new file mode 100644 index 00000000..fdd11263 --- /dev/null +++ b/src/port/amd/common/mmu.h @@ -0,0 +1,12 @@ +/* mmu.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + */ +#ifndef AMD_MMU_H +#define AMD_MMU_H + +void mmu_enable(void); + +#endif /* AMD_MMU_H */ diff --git a/src/port/amd/common/uart.h b/src/port/amd/common/uart.h new file mode 100644 index 00000000..e622422f --- /dev/null +++ b/src/port/amd/common/uart.h @@ -0,0 +1,20 @@ +/* uart.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
+ */ +#ifndef AMD_UART_H +#define AMD_UART_H + +#include +#include "wolfip.h" /* for ip4 */ + +void uart_init(void); +void uart_putc(char c); +void uart_puts(const char *s); +void uart_puthex(uint32_t val); +void uart_putdec(uint32_t val); +void uart_putip4(ip4 ip); + +#endif /* AMD_UART_H */ diff --git a/src/port/amd/common/uart_util.c b/src/port/amd/common/uart_util.c new file mode 100644 index 00000000..068c6f78 --- /dev/null +++ b/src/port/amd/common/uart_util.c @@ -0,0 +1,57 @@ +/* uart_util.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Hardware-independent UART formatting helpers shared by the UART drivers + * (uart_cadence.c / uart_pl011.c). Each driver provides the hardware- + * specific uart_init() and uart_putc(); these build on uart_putc(). + */ +#include +#include "uart.h" + +void uart_puts(const char *s) +{ + while (*s) { + if (*s == '\n') + uart_putc('\r'); + uart_putc(*s++); + } +} + +void uart_puthex(uint32_t val) +{ + static const char hex[] = "0123456789ABCDEF"; + int i; + uart_puts("0x"); + for (i = 28; i >= 0; i -= 4) + uart_putc(hex[(val >> i) & 0xF]); +} + +void uart_putdec(uint32_t val) +{ + char buf[11]; + int i = 0; + if (val == 0) { + uart_putc('0'); + return; + } + while (val > 0 && i < (int)sizeof(buf)) { + buf[i++] = '0' + (char)(val % 10); + val /= 10; + } + while (i > 0) + uart_putc(buf[--i]); +} + +void uart_putip4(ip4 ip) +{ + uart_putdec((ip >> 24) & 0xFF); + uart_putc('.'); + uart_putdec((ip >> 16) & 0xFF); + uart_putc('.'); + uart_putdec((ip >> 8) & 0xFF); + uart_putc('.'); + uart_putdec(ip & 0xFF); +} diff --git a/src/port/amd/common/wolfip_config.h b/src/port/amd/common/wolfip_config.h new file mode 100644 index 00000000..7fbf0a84 --- /dev/null +++ b/src/port/amd/common/wolfip_config.h @@ -0,0 +1,100 @@ +/* wolfip_config.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
+ * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Shared wolfIP profile for the AMD/Xilinx bare-metal ports. The three + * boards use an identical UDP-only-intent profile, so it lives here and each + * board's config.h is a thin shim that includes it. A board can still + * override any value by #defining it before the include (the #ifndef-guarded + * entries) or by editing its own config.h. + */ +#ifndef AMD_WOLFIP_CONFIG_H +#define AMD_WOLFIP_CONFIG_H + +#ifndef CONFIG_IPFILTER +#define CONFIG_IPFILTER 0 +#endif + +#define ETHERNET +#define LINK_MTU 1536 + +/* UDP-only profile in intent: the application does not call + * wolfIP_sock_socket() with IPSTACK_SOCK_STREAM. MAX_TCPSOCKETS is set + * to a small non-zero value only because core wolfIP currently sizes + * its timer heap via MAX_TIMERS = MAX_TCPSOCKETS * 3, and DHCP / ARP + * aging need timers. With MAX_TCPSOCKETS=0 the timer-heap insert path + * is permanently full and DHCP cannot schedule its retransmit timer. + * A core wolfIP follow-up should decouple MAX_TIMERS from + * MAX_TCPSOCKETS so DAL-C builds can truly opt TCP code out at + * compile time. */ +#ifdef SPEED_TEST +/* TCP throughput build (port 9 sink/source). 
Bigger per-socket rx/tx + * buffers raise the TCP window (the rx free space the board advertises and + * the tx data it can keep in flight); the UDP socket count is trimmed since + * only DHCP needs UDP here. Every socket embeds rxmem[RXBUF_SIZE] + + * txmem[TXBUF_SIZE], so the totals are kept within the 256 KB OCM budget + * (the linker gates any overflow). */ +#define MAX_TCPSOCKETS 2 /* listen + one client (passive close, + * so no board-side TIME_WAIT slot) */ +#define MAX_UDPSOCKETS 2 /* DHCP only */ +#define MAX_ICMPSOCKETS 1 +#define RXBUF_SIZE (LINK_MTU * 6) +#define TXBUF_SIZE (LINK_MTU * 6) +#else +#define MAX_TCPSOCKETS 2 +#define MAX_UDPSOCKETS 4 +#define MAX_ICMPSOCKETS 1 +#define RXBUF_SIZE (LINK_MTU * 4) +#define TXBUF_SIZE (LINK_MTU * 4) +#endif + +#define MAX_NEIGHBORS 16 + +#ifndef WOLFIP_MAX_INTERFACES +#define WOLFIP_MAX_INTERFACES 1 +#endif + +#ifndef WOLFIP_ENABLE_FORWARDING +#define WOLFIP_ENABLE_FORWARDING 0 +#endif + +#ifndef WOLFIP_ENABLE_LOOPBACK +#define WOLFIP_ENABLE_LOOPBACK 0 +#endif + +#ifndef WOLFIP_ENABLE_DHCP +#define WOLFIP_ENABLE_DHCP 1 +#endif + +/* Static IP fallback (used if DHCP is disabled or times out). */ +#define WOLFIP_IP "192.168.1.100" +#define WOLFIP_NETMASK "255.255.255.0" +#define WOLFIP_GW "192.168.1.1" +#define WOLFIP_STATIC_DNS_IP "8.8.8.8" + +#if WOLFIP_ENABLE_DHCP +#define DHCP +#define DHCP_DISCOVER_RETRIES 2 +#define DHCP_REQUEST_RETRIES 2 +#endif + +/* Hardware debug: define for verbose GEM / MDIO / DHCP logging. */ +/* #define DEBUG_HW */ + +#endif /* AMD_WOLFIP_CONFIG_H */ diff --git a/src/port/amd/ip/gem_rx_irq.c b/src/port/amd/ip/gem_rx_irq.c new file mode 100644 index 00000000..7ab06cb9 --- /dev/null +++ b/src/port/amd/ip/gem_rx_irq.c @@ -0,0 +1,38 @@ +/* gem_rx_irq.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * IRQ-driven RX policy: gem_isr() runs off the real GIC SPI and fills swq[]; + * gem_eth_poll() only drains the SW queue. 
/* ll->poll hook for the IRQ-driven model. Frames are staged into the
 * software queue by gem_isr(), so polling only pops one staged entry.
 *
 * ll is unused; buf/len describe the caller's destination buffer. The
 * return value is passed through from gem_swq_drain(). */
int gem_eth_poll(struct wolfIP_ll_dev *ll, void *buf, uint32_t len)
{
    (void)ll;
    return gem_swq_drain(buf, len);
}

/* Register the GEM ISR, enable its SPI at the GIC distributor, and arm the
 * RX-side GEM interrupts. gem_isr is the single (IRQ-context) producer. */
void gem_rx_install(void)
{
    /* Handler goes in first so it is in place before the SPI is enabled. */
    gic_register_handler(IRQ_GEM, gem_isr);
    gic_enable_spi(IRQ_GEM, 0xA0);
    /* Unmask frame-received plus the RX error/backpressure sources. */
    GEM_IER = IXR_FRAMERX | IXR_RXUSED | IXR_RXOVR | IXR_HRESPNOK;
}
+ */ +#include +#include "gem_regs.h" +#include "gem_port.h" +#include "cache.h" + +int gem_eth_poll(struct wolfIP_ll_dev *ll, void *buf, uint32_t len) +{ + uint32_t status; + uint32_t frame_len; + uint32_t copy; + uint32_t addr; + + (void)ll; + + cache_inval(gem_rx_ring, sizeof(gem_rx_ring)); + if (!(gem_rx_ring[gem_rx_next].addr & RXBUF_OWN_SW)) { + /* No frame. If the MAC hit "buffer not available" while the ring + * was momentarily full, clear it so it re-walks the recycled ring + * rather than wedging the RX path. */ + if (GEM_RSR & RSR_BUFFNA) + GEM_RSR = RSR_BUFFNA; + return 0; + } + + status = gem_rx_ring[gem_rx_next].status; + frame_len = status & RXBUF_LEN_MASK; + cache_inval(gem_rx_buf_pool[gem_rx_next], frame_len); + copy = frame_len; + if (copy > len) + copy = len; + memcpy(buf, gem_rx_buf_pool[gem_rx_next], copy); + gem_rxframes++; + + /* Recycle the BD: clear status, then rewrite addr with OWN=0 (WRAP on + * the last BD) and push it to memory so the MAC reuses the slot. */ + addr = (uint32_t)(uintptr_t)gem_rx_buf_pool[gem_rx_next]; + addr &= RXBUF_ADDR_MASK; + if (gem_rx_next == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + gem_rx_ring[gem_rx_next].status = 0; + __asm__ volatile ("dsb" ::: "memory"); + gem_rx_ring[gem_rx_next].addr = addr; /* OWN=0 -> hardware can write */ + cache_clean(&gem_rx_ring[gem_rx_next], sizeof(gem_rx_ring[gem_rx_next])); + __asm__ volatile ("dsb" ::: "memory"); + + gem_rx_next = (gem_rx_next + 1) % RX_RING_LEN; + return (int)copy; +} + +void gem_rx_install(void) +{ + /* RX is polled; the GEM interrupt stays masked (GEM_IDR was set to + * all-ones in amd_eth_init) and no SPI is enabled at the GIC. */ +} diff --git a/src/port/amd/ip/gem_rx_swq_poll.c b/src/port/amd/ip/gem_rx_swq_poll.c new file mode 100644 index 00000000..f8afa727 --- /dev/null +++ b/src/port/amd/ip/gem_rx_swq_poll.c @@ -0,0 +1,29 @@ +/* gem_rx_swq_poll.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
/* ll->poll hook for the poll-drained swq model: call gem_isr() by hand to
 * move completed RX descriptors into the software queue, then pop one
 * staged entry into buf.
 *
 * ll is unused; buf/len describe the caller's destination buffer. The
 * return value is passed through from gem_swq_drain(). */
int gem_eth_poll(struct wolfIP_ll_dev *ll, void *buf, uint32_t len)
{
    (void)ll;
    gem_isr();
    return gem_swq_drain(buf, len);
}

/* Poll-only on this board: gem_eth_poll drives gem_isr from the main loop, so
 * we deliberately do NOT register/enable the GEM IRQ here. Arming it would
 * put a second gem_isr producer (IRQ context) on the single-producer swq
 * if GICv3 delivery ever starts working (see the root-cause note on the
 * GICv3 group configuration in gic_gicv3.c). */
void gem_rx_install(void)
{
}
(When + * SWQ_DEPTH == RX_RING_LEN the queue saturates under sustained RX and the + * full-handling has to recycle/stall BDs, which wedged the RX ring under a + * TCP-rate load that the UDP profile never produced.) */ +#define SWQ_DEPTH 64 + +struct swq_slot { + uint8_t *buf; + uint16_t len; + uint16_t ring_idx; /* into gem_rx_ring[] - recycle after consume */ +}; + +static volatile struct swq_slot swq[SWQ_DEPTH]; +static volatile uint32_t swq_head; /* producer (gem_isr) */ +static volatile uint32_t swq_tail; /* consumer (gem_swq_drain) */ + +/* Fill swq[] from the RX BD ring. Single producer. */ +void gem_isr(void) +{ + uint32_t isr; + + gem_irqs++; + isr = GEM_ISR; + GEM_ISR = isr; /* clear-on-write */ + + /* Invalidate the whole RX ring - the MAC may have written any BD. */ + cache_inval(gem_rx_ring, sizeof(gem_rx_ring)); + + while (gem_rx_ring[gem_rx_next].addr & RXBUF_OWN_SW) { + uint32_t status; + uint32_t next_head = swq_head; + uint32_t slot = next_head % SWQ_DEPTH; + + /* If the SW queue is full, stop draining and leave this BD owned by + * software (enqueued, not yet recycled). Recycling it here would + * hand a buffer still referenced by an outstanding swq slot back to + * the MAC, which could DMA over a frame the consumer is about to + * read. The MAC backpressures via BUFFNA (cleared below); + * gem_swq_drain frees ring slots as the main loop consumes them. + * A BD is therefore recycled ONLY in gem_swq_drain. 
*/ + if ((next_head - swq_tail) >= SWQ_DEPTH) + break; + + status = gem_rx_ring[gem_rx_next].status; + gem_rxframes++; + cache_inval(gem_rx_buf_pool[gem_rx_next], status & RXBUF_LEN_MASK); + + swq[slot].buf = gem_rx_buf_pool[gem_rx_next]; + swq[slot].len = (uint16_t)(status & RXBUF_LEN_MASK); + swq[slot].ring_idx = (uint16_t)gem_rx_next; + __asm__ volatile ("dsb sy" ::: "memory"); + swq_head = next_head + 1; + + gem_rx_next = (gem_rx_next + 1) % RX_RING_LEN; + } + + if (isr & IXR_RXUSED) + GEM_RSR = RSR_BUFFNA; + if (isr & IXR_RXOVR) + GEM_RSR = RSR_RXOVR; +} + +/* Consume one swq slot into buf; recycle its BD to hardware. Single + * consumer. Returns bytes copied (0 if empty). */ +int gem_swq_drain(void *buf, uint32_t len) +{ + uint32_t tail = swq_tail; + uint32_t slot; + uint32_t copy; + uint32_t addr; + uint16_t idx; + + if (tail == swq_head) + return 0; /* SW queue empty */ + + /* Acquire barrier paired with the producer's release (the dsb before + * swq_head in gem_isr): once we have observed the new head, ensure the + * slot's buf/len/ring_idx writes are visible before we read them. Needed + * when the producer is gem_isr() in IRQ context (gem_rx_irq.c); a no-op + * for the poll model where producer and consumer are the same thread. */ + __asm__ volatile ("dmb ld" ::: "memory"); + + slot = tail % SWQ_DEPTH; + copy = swq[slot].len; + if (copy > len) + copy = len; + memcpy(buf, swq[slot].buf, copy); + + /* Recycle the BD back to hardware. 
*/ + idx = swq[slot].ring_idx; + addr = (uint32_t)(uintptr_t)gem_rx_buf_pool[idx]; + addr &= RXBUF_ADDR_MASK; + if (idx == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + gem_rx_ring[idx].status = 0; + __asm__ volatile ("dsb sy" ::: "memory"); + gem_rx_ring[idx].addr = addr; /* OWN bit cleared = HW can write */ + cache_clean(&gem_rx_ring[idx], sizeof(gem_rx_ring[idx])); + + __asm__ volatile ("dsb sy" ::: "memory"); + swq_tail = tail + 1; + + return (int)copy; +} diff --git a/src/port/amd/ip/gic_gicv2.c b/src/port/amd/ip/gic_gicv2.c new file mode 100644 index 00000000..27f12b20 --- /dev/null +++ b/src/port/amd/ip/gic_gicv2.c @@ -0,0 +1,205 @@ +/* gic.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * GICv2 (mem-mapped distributor + CPU interface) minimal driver, + * shared by the GICv2 boards (ZynqMP Cortex-A53 EL3 / Zynq-7000 + * Cortex-A9). Register bases come from board.h. + * Configures all SPIs as Group 0 (IGROUPR bits cleared), level- + * triggered, targeted at CPU0, priority 0xA0. With GICC_CTLR.FIQEn=0 + * a pending Group 0 interrupt is delivered as IRQ, not FIQ. Only + * the SPIs explicitly enabled via gic_enable_spi() will fire. 
The + * IRQ vector in startup.S funnels into irq_dispatch() here, which + * reads IAR, routes to the registered C handler, and EOIs. + * + * No assumptions about a previous BSP - we initialize the distributor + * and CPU interface from scratch. + */ +#include +#include "board.h" +#include "gic.h" + +/* Distributor registers */ +#define GICD_CTLR (*(volatile uint32_t *)(GICD_BASE + 0x000)) +#define GICD_TYPER (*(volatile uint32_t *)(GICD_BASE + 0x004)) +#define GICD_IGROUPR(n) (*(volatile uint32_t *)(GICD_BASE + 0x080 + 4*(n))) +#define GICD_ISENABLER(n) (*(volatile uint32_t *)(GICD_BASE + 0x100 + 4*(n))) +#define GICD_ICENABLER(n) (*(volatile uint32_t *)(GICD_BASE + 0x180 + 4*(n))) +#define GICD_ISPENDR(n) (*(volatile uint32_t *)(GICD_BASE + 0x200 + 4*(n))) +#define GICD_ICPENDR(n) (*(volatile uint32_t *)(GICD_BASE + 0x280 + 4*(n))) +#define GICD_IPRIORITYR(n) (*(volatile uint32_t *)(GICD_BASE + 0x400 + 4*(n))) +#define GICD_ITARGETSR(n) (*(volatile uint32_t *)(GICD_BASE + 0x800 + 4*(n))) +#define GICD_ICFGR(n) (*(volatile uint32_t *)(GICD_BASE + 0xC00 + 4*(n))) +#define GICD_SGIR (*(volatile uint32_t *)(GICD_BASE + 0xF00)) + +/* CPU interface registers */ +#define GICC_CTLR (*(volatile uint32_t *)(GICC_BASE + 0x000)) +#define GICC_PMR (*(volatile uint32_t *)(GICC_BASE + 0x004)) +#define GICC_BPR (*(volatile uint32_t *)(GICC_BASE + 0x008)) +#define GICC_IAR (*(volatile uint32_t *)(GICC_BASE + 0x00C)) +#define GICC_EOIR (*(volatile uint32_t *)(GICC_BASE + 0x010)) + +#define GIC_NR_LINES 192 /* GICv2 architecturally supports up to 192 SPIs */ + +static gic_handler_t handlers[GIC_NR_LINES]; +static volatile uint32_t g_irq_total; +static volatile uint32_t g_irq_last_intid; + +void gic_register_handler(uint32_t intid, gic_handler_t fn) +{ + if (intid < GIC_NR_LINES) + handlers[intid] = fn; +} + +static void byte_write(volatile uint32_t *reg, uint32_t intid, uint8_t val) +{ + uint32_t shift; + uint32_t v; + shift = (intid & 3u) * 8u; + v = reg[intid >> 2]; + v &= ~(0xFFu 
<< shift);
+    v |= ((uint32_t)val << shift);
+    reg[intid >> 2] = v;
+}
+
+/* Configure one interrupt line as Group 0, level-triggered, targeting
+ * CPU0 at the given priority (only bits [7:3] are kept), then clear any
+ * stale pending state and enable it.
+ *
+ * INTIDs that do not fit handlers[] are ignored: irq_dispatch() would
+ * acknowledge such an interrupt without running any handler, so a
+ * level-triggered source would simply fire again. */
+void gic_enable_spi(uint32_t intid, uint32_t priority)
+{
+    if (intid >= GIC_NR_LINES)
+        return;
+
+    /* Set priority (lower number = higher prio). */
+    byte_write((volatile uint32_t *)(GICD_BASE + 0x400),
+               intid, (uint8_t)(priority & 0xF8u));
+    /* Target CPU0. */
+    byte_write((volatile uint32_t *)(GICD_BASE + 0x800),
+               intid, 0x01u);
+    /* Group 0 (Secure) - we run at EL3 Secure, so Group 0 is the
+     * correct choice. GICC.FIQEn=0 makes Group 0 route to IRQ, which
+     * is what our vector table handles. */
+    GICD_IGROUPR(intid >> 5) &= ~(1u << (intid & 31u));
+    /* Level-triggered (ICFGR bits = 0b00 -> level, 0b10 -> edge). */
+    {
+        uint32_t reg;
+        uint32_t shift;
+        shift = (intid & 15u) * 2u;
+        reg = GICD_ICFGR(intid >> 4);
+        reg &= ~(3u << shift);
+        GICD_ICFGR(intid >> 4) = reg;
+    }
+    /* Clear pending and enable. */
+    GICD_ICPENDR(intid >> 5) = (1u << (intid & 31u));
+    GICD_ISENABLER(intid >> 5) = (1u << (intid & 31u));
+}
+
+/* Disable one interrupt line (write-1-to-clear enable register). */
+void gic_disable_spi(uint32_t intid)
+{
+    GICD_ICENABLER(intid >> 5) = (1u << (intid & 31u));
+}
+
+/* Bring the distributor and this CPU's interface to a known state:
+ * everything disabled, not pending, Group 0, priority 0xA0, SPIs
+ * targeting CPU0 and level-triggered; then enable the distributor and
+ * the CPU interface. */
+void gic_init(void)
+{
+    uint32_t i;
+
+    /* Disable distributor while we reconfigure. */
+    GICD_CTLR = 0;
+
+    /* SGIs and PPIs (INTID 0..31): Group 0 Secure, but leave disabled
+     * for now - enabling them lit up some pending PPI from CSU/PMU
+     * that hung wolfIP_init when it occupied the CPU interface. */
+    GICD_IGROUPR(0) = 0;
+    GICD_ICENABLER(0) = 0xFFFFFFFFu;
+    GICD_ICPENDR(0) = 0xFFFFFFFFu;
+    /* SPIs (INTID 32+): disable all, mark all as Group 0. */
+    for (i = 1; i < (GIC_NR_LINES / 32u); i++) {
+        GICD_ICENABLER(i) = 0xFFFFFFFFu;
+        GICD_ICPENDR(i) = 0xFFFFFFFFu;
+        GICD_IGROUPR(i) = 0;
+    }
+    /* SGI/PPI priorities (lower 8 entries cover INTID 0..31). */
+    for (i = 0; i < 8u; i++)
+        GICD_IPRIORITYR(i) = 0xA0A0A0A0u;
+    for (i = 8u; i < (GIC_NR_LINES / 4u); i++)
+        GICD_IPRIORITYR(i) = 0xA0A0A0A0u;
+    for (i = 8u; i < (GIC_NR_LINES / 4u); i++)
+        GICD_ITARGETSR(i) = 0x01010101u;
+    for (i = 2u; i < (GIC_NR_LINES / 16u); i++)
+        GICD_ICFGR(i) = 0;
+
+    /* Enable distributor: both groups (we are at EL3). */
+    GICD_CTLR = 0x3u;
+
+    /* CPU interface: priority mask wide open, both groups enabled,
+     * FIQEn=0 so Group 0 (Secure) interrupts route to nIRQ output
+     * (per GICv2 IHI 0048B 4.6.4: FIQEn=0 -> nIRQ, FIQEn=1 -> nFIQ).
+     * AckCtl=1 so Secure reads of GICC_IAR can ack Group 1 too. */
+    GICC_PMR = 0xF8u;
+    GICC_BPR = 0;
+    GICC_CTLR = 0x07u; /* EnableGrp0 | EnableGrp1 | AckCtl, FIQEn=0 */
+}
+
+/* IRQ entry point: acknowledge one interrupt, run its registered
+ * handler (if any) and signal end-of-interrupt. */
+void irq_dispatch(void)
+{
+    uint32_t iar;
+    uint32_t intid;
+
+    iar = GICC_IAR;
+    intid = iar & 0x3FFu;
+    if (intid >= 1020u) /* 1020-1023 spurious / no pending */
+        return;         /* do not EOI a spurious INTID */
+    g_irq_total++;
+    g_irq_last_intid = intid;
+    if (intid < GIC_NR_LINES && handlers[intid] != 0)
+        handlers[intid]();
+    GICC_EOIR = iar;
+}
+
+uint32_t gic_total_irqs(void) { return g_irq_total; }
+uint32_t gic_last_intid(void) { return g_irq_last_intid; }
+
+/* Polled variant of irq_dispatch(): acknowledge and service pending
+ * interrupts from the main loop. Returns the number serviced. */
+uint32_t gic_poll_dispatch(void)
+{
+    uint32_t n = 0;
+    uint32_t iar;
+    uint32_t intid;
+
+    /* Drain up to 8 interrupts per poll to avoid live-locking the
+     * main loop if a peripheral is hammering us. */
+    while (n < 8) {
+        iar = GICC_IAR;
+        intid = iar & 0x3FFu;
+        if (intid >= 1020u) /* 1020-1023 spurious / no pending */
+            break;          /* do not EOI a spurious INTID */
+        g_irq_total++;
+        g_irq_last_intid = intid;
+        if (intid < GIC_NR_LINES && handlers[intid] != 0)
+            handlers[intid]();
+        GICC_EOIR = iar;
+        n++;
+    }
+    return n;
+}
+
+/* Returns 1 if the given INTID is pending in the distributor, else 0. */
+uint32_t gic_is_pending(uint32_t intid)
+{
+    return (GICD_ISPENDR(intid >> 5) >> (intid & 31u)) & 1u;
+}
+
+/* Raise software-generated interrupt `intid` (0..15) on this CPU. */
+void gic_self_test_sgi(uint32_t intid)
+{
+    /* GICD_SGIR: TargetListFilter (bits 25:24) = 10 (self),
+     * SGIINTID (bits 3:0) = intid. */
+    GICD_SGIR = (2u << 24) | (intid & 0xFu);
+}
diff --git a/src/port/amd/ip/gic_gicv3.c b/src/port/amd/ip/gic_gicv3.c
new file mode 100644
index 00000000..b3564a89
--- /dev/null
+++ b/src/port/amd/ip/gic_gicv3.c
@@ -0,0 +1,226 @@
+/* gic_gicv3.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * GICv3 minimal driver for Cortex-A72 on Versal Gen 1. GICv3 differs
+ * from the ZCU102's GIC-400 (GICv2) in three structural ways:
+ *
+ *   - The CPU interface is accessed via AArch64 system registers
+ *     (ICC_*_EL1 / ICC_*_EL3) rather than memory-mapped GICC.
+ *   - Each CPU has its own redistributor (GICR) memory map; SGI/PPI
+ *     enable/priority/group regs live there instead of GICD.
+ *   - Affinity routing is the default; legacy CPU-target byte fields
+ *     do not exist for SPIs.
+ *
+ * BL31 (TF-A) on Versal normally initialises the GIC distributor and
+ * the per-CPU redistributor for us; we only re-prime per-INTID config
+ * for the SPIs we use and enable the CPU interface for our EL.
+ *
+ * Brought up on a VMK180 (Cortex-A72 EL3). Adapted from the GICv2
+ * driver under src/port/zcu102/gic.c for GICv3 system registers and
+ * the per-CPU redistributor.
+ */
+#include <stdint.h>
+#include "board.h"
+#include "gic.h"
+
+/* Distributor registers */
+#define GICD_CTLR           (*(volatile uint32_t *)(GICD_BASE + 0x000))
+#define GICD_TYPER          (*(volatile uint32_t *)(GICD_BASE + 0x004))
+#define GICD_IGROUPR(n)     (*(volatile uint32_t *)(GICD_BASE + 0x080 + 4*(n)))
+#define GICD_ISENABLER(n)   (*(volatile uint32_t *)(GICD_BASE + 0x100 + 4*(n)))
+#define GICD_ICENABLER(n)   (*(volatile uint32_t *)(GICD_BASE + 0x180 + 4*(n)))
+#define GICD_ISPENDR(n)     (*(volatile uint32_t *)(GICD_BASE + 0x200 + 4*(n)))
+#define GICD_ICPENDR(n)     (*(volatile uint32_t *)(GICD_BASE + 0x280 + 4*(n)))
+#define GICD_IPRIORITYR(n)  (*(volatile uint32_t *)(GICD_BASE + 0x400 + 4*(n)))
+#define GICD_ICFGR(n)       (*(volatile uint32_t *)(GICD_BASE + 0xC00 + 4*(n)))
+#define GICD_IROUTER(n)     (*(volatile uint64_t *)(GICD_BASE + 0x6000 + 8*(n)))
+
+#define GICD_CTLR_ARE_S     (1u << 4)
+#define GICD_CTLR_ARE_NS    (1u << 5)
+#define GICD_CTLR_ENG0      (1u << 0)
+#define GICD_CTLR_ENG1S     (1u << 2)
+
+/* Redistributor for CPU 0 */
+#define GICR_CTLR           (*(volatile uint32_t *)(GICR_BASE + 0x000))
+#define GICR_WAKER          (*(volatile uint32_t *)(GICR_BASE + 0x014))
+
+#define GICR_SGI_BASE       (GICR_BASE + 0x10000)
+#define GICR_IGROUPR0       (*(volatile uint32_t *)(GICR_SGI_BASE + 0x080))
+#define GICR_ISENABLER0     (*(volatile uint32_t *)(GICR_SGI_BASE + 0x100))
+#define GICR_ICENABLER0     (*(volatile uint32_t *)(GICR_SGI_BASE + 0x180))
+#define GICR_IPRIORITYR(n)  (*(volatile uint32_t *)(GICR_SGI_BASE + 0x400 + 4*(n)))
+#define GICR_ICFGR0         (*(volatile uint32_t *)(GICR_SGI_BASE + 0xC00))
+#define GICR_ICFGR1         (*(volatile uint32_t *)(GICR_SGI_BASE + 0xC04))
+
+#define GICR_WAKER_PS       (1u << 1)
+#define GICR_WAKER_CA       (1u << 2)
+
+#define GIC_NR_LINES        224
+static gic_handler_t handlers[GIC_NR_LINES];
+static volatile uint32_t g_irq_total;
+static volatile uint32_t g_irq_last_intid;
+
+/* Install (or clear, with fn == 0) the handler irq_dispatch() calls for
+ * `intid`. INTIDs outside the handler table are silently ignored. */
+void gic_register_handler(uint32_t intid, gic_handler_t fn)
+{
+    if (intid < GIC_NR_LINES)
+        handlers[intid] = fn;
+}
+
+/* ICC_*_EL1 / ICC_*_EL3 system register accessors. The encoded
+ * MSR/MRS forms below avoid relying on a particular assembler
+ * version supporting the symbolic names. */
+static inline void icc_sre_el3_set(uint64_t v)
+{
+    __asm__ volatile ("msr S3_6_C12_C12_5, %0" :: "r"(v));
+    __asm__ volatile ("isb" ::: "memory");
+}
+
+static inline void icc_pmr_el1_set(uint64_t v)
+{
+    __asm__ volatile ("msr S3_0_C4_C6_0, %0" :: "r"(v));
+}
+
+static inline void icc_igrpen1_el1_set(uint64_t v)
+{
+    __asm__ volatile ("msr S3_0_C12_C12_7, %0" :: "r"(v));
+}
+
+static inline void icc_igrpen0_el1_set(uint64_t v)
+{
+    __asm__ volatile ("msr S3_0_C12_C12_6, %0" :: "r"(v));
+}
+
+static inline uint64_t icc_iar1_el1_read(void)
+{
+    uint64_t v;
+    __asm__ volatile ("mrs %0, S3_0_C12_C12_0" : "=r"(v));
+    return v;
+}
+
+static inline void icc_eoir1_el1_write(uint64_t v)
+{
+    __asm__ volatile ("msr S3_0_C12_C12_1, %0" :: "r"(v));
+}
+
+static inline void icc_ctlr_el1_set(uint64_t v)
+{
+    __asm__ volatile ("msr S3_0_C12_C12_4, %0" :: "r"(v));
+}
+
+/* Wake CPU 0's redistributor: clear GICR_WAKER.ProcessorSleep and spin
+ * until ChildrenAsleep reads back as 0. */
+static void gicr_wakeup(void)
+{
+    uint32_t waker = GICR_WAKER;
+    waker &= ~GICR_WAKER_PS;
+    GICR_WAKER = waker;
+    while (GICR_WAKER & GICR_WAKER_CA)
+        ;
+}
+
+/* Minimal GICv3 bring-up: distributor with secure affinity routing,
+ * all SPIs and SGIs/PPIs disabled at priority 0xA0, SPIs routed to
+ * affinity 0, redistributor woken, and the system-register CPU
+ * interface enabled with the priority mask at 0xF8. */
+void gic_init(void)
+{
+    uint32_t i;
+
+    GICD_CTLR = GICD_CTLR_ARE_S | GICD_CTLR_ENG1S;
+
+    for (i = 1; i < (GIC_NR_LINES / 32u); i++) {
+        GICD_IGROUPR(i) = 0xFFFFFFFFu;
+        GICD_ICENABLER(i) = 0xFFFFFFFFu;
+    }
+    for (i = 8u; i < (GIC_NR_LINES / 4u); i++)
+        GICD_IPRIORITYR(i) = 0xA0A0A0A0u;
+    for (i = 32; i < GIC_NR_LINES; i++)
+        GICD_IROUTER(i) = 0;
+
+    gicr_wakeup();
+    GICR_IGROUPR0 = 0xFFFFFFFFu;
+    GICR_ICENABLER0 = 0xFFFFFFFFu;
+    for (i = 0; i < 8; i++)
+        GICR_IPRIORITYR(i) = 0xA0A0A0A0u;
+
+    icc_sre_el3_set(0xF);
+    icc_pmr_el1_set(0xF8);
+    icc_ctlr_el1_set(0);
+    icc_igrpen1_el1_set(1);
+    icc_igrpen0_el1_set(1);
+    /* Context-synchronize so the ICC_* writes above have taken effect
+     * before the caller relies on the CPU interface. */
+    __asm__ volatile ("isb" ::: "memory");
+}
+
+/* NOTE: the GICv3 SPI-IRQ delivery path is intentionally UNUSED and
+ * UNVERIFIED on these ports. Versal drives RX by polling gem_isr() from the
+ * main loop (ip/gem_rx_swq_poll.c; gem_rx_install() is a no-op and never
+ * calls gic_enable_spi()). The Group / IGROUPR programming here has not been
+ * validated to deliver an SPI to the Secure EL3 context and would need
+ * review (Group 0 Secure or Group 1 Secure via IGRPMODR, plus the EL3
+ * ICC_* registers) before enabling IRQ-driven RX.
+ *
+ * Only SPIs that fit handlers[] (32 <= intid < GIC_NR_LINES) are
+ * accepted: SGIs/PPIs are configured through the redistributor (see
+ * gic_init), and an unchecked intid would index the priority bytes and
+ * GICD_IROUTER at an arbitrary offset from GICD_BASE. */
+void gic_enable_spi(uint32_t intid, uint32_t priority)
+{
+    uint32_t reg, shift;
+    volatile uint8_t *prio_byte;
+
+    if (intid < 32u || intid >= GIC_NR_LINES)
+        return;
+
+    /* GICD_IPRIORITYR is byte-accessible: one priority byte per INTID. */
+    prio_byte = (volatile uint8_t *)(GICD_BASE + 0x400);
+    prio_byte[intid] = (uint8_t)(priority & 0xF8u);
+
+    GICD_IGROUPR(intid >> 5) |= (1u << (intid & 31u));
+    GICD_IROUTER(intid) = 0;
+
+    /* Clear both ICFGR bits for this INTID (level-sensitive). */
+    shift = (intid & 15u) * 2u;
+    reg = GICD_ICFGR(intid >> 4);
+    reg &= ~(3u << shift);
+    GICD_ICFGR(intid >> 4) = reg;
+
+    /* Drop any stale pending state, then enable. */
+    GICD_ICPENDR(intid >> 5) = (1u << (intid & 31u));
+    GICD_ISENABLER(intid >> 5) = (1u << (intid & 31u));
+}
+
+/* IRQ entry point: acknowledge one interrupt via ICC_IAR1_EL1, run its
+ * registered handler (if any) and write ICC_EOIR1_EL1. */
+void irq_dispatch(void)
+{
+    uint64_t iar;
+    uint32_t intid;
+
+    iar = icc_iar1_el1_read();
+    intid = (uint32_t)(iar & 0xFFFFFFu);
+    if (intid >= 1020u) /* 1020-1023 spurious / no pending */
+        return;         /* do not EOI a spurious INTID */
+    g_irq_total++;
+    g_irq_last_intid = intid;
+    if (intid < GIC_NR_LINES && handlers[intid] != 0)
+        handlers[intid]();
+    icc_eoir1_el1_write(iar);
+}
+
+uint32_t gic_total_irqs(void) { return g_irq_total; }
+uint32_t gic_last_intid(void) { return g_irq_last_intid; }
+
+/* Returns 1 if the given INTID is pending in the distributor, else 0. */
+uint32_t gic_is_pending(uint32_t intid)
+{
+    return (GICD_ISPENDR(intid >> 5) >> (intid & 31u)) & 1u;
+}
+
+/* Disable one interrupt line (write-1-to-clear enable register). */
+void gic_disable_spi(uint32_t intid)
+{
+    GICD_ICENABLER(intid >> 5) = (1u << (intid & 31u));
+}
+
+/* Raise software-generated interrupt `intid` (0..15) on this CPU. */
+void gic_self_test_sgi(uint32_t intid)
+{
+    /* GICv3 ICC_SGI1R_EL1: target self via target list 1 */
+    uint64_t v = ((uint64_t)(intid & 0xF) << 24) | 1u;
+    __asm__ volatile ("msr S3_0_C12_C11_5, %0" :: "r"(v));
+    __asm__ volatile ("isb" ::: "memory");
+}
diff --git a/src/port/amd/ip/phy_dispatch_dp83867.c b/src/port/amd/ip/phy_dispatch_dp83867.c
new file mode 100644
index 00000000..82229f2a
--- /dev/null
+++ b/src/port/amd/ip/phy_dispatch_dp83867.c
@@ -0,0 +1,21 @@
+/* phy_dispatch_dp83867.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * PHY dispatch for boards that fit only the TI DP83867 (ZCU102, VMK180).
+ */
+#include "gem_port.h"
+#include "phy_dp83867.h"
+
+/* Initialise the PHY at `phy_addr`. Only one PHY type is supported in
+ * this build, so the vendor ID word `id1` is ignored and the call is
+ * forwarded to dp83867_init(); its return value is passed through. */
+int gem_phy_init(uint8_t phy_addr, uint16_t id1, int *speed, int *full_duplex)
+{
+    (void)id1;
+    return dp83867_init(phy_addr, speed, full_duplex);
+}
+
+/* Link state of the PHY at `phy_addr`, as reported by
+ * dp83867_link_status(). */
+int gem_phy_link_status(uint8_t phy_addr)
+{
+    return dp83867_link_status(phy_addr);
+}
diff --git a/src/port/amd/ip/phy_dispatch_multi.c b/src/port/amd/ip/phy_dispatch_multi.c
new file mode 100644
index 00000000..27d4ea9a
--- /dev/null
+++ b/src/port/amd/ip/phy_dispatch_multi.c
@@ -0,0 +1,29 @@
+/* phy_dispatch_multi.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * PHY dispatch for boards that may fit either a Marvell 88E1518 or a TI
+ * DP83867 (Zynq-7000: the ZC702 fits the Marvell). Dispatch is on the
+ * PHY vendor OUI in MII register 2 (id1).
+ */
+#include "gem_port.h"
+#include "phy_dp83867.h"
+#include "phy_marvell.h"
+#include "uart.h"
+
+/* Initialise the PHY at `phy_addr`, choosing the driver from `id1`:
+ * MARVELL_PHY_ID1 selects the 88E1518 driver; any other value falls
+ * through to the DP83867 driver. The chosen driver's return value is
+ * passed through. */
+int gem_phy_init(uint8_t phy_addr, uint16_t id1, int *speed, int *full_duplex)
+{
+    if ((id1 & 0xFFFFu) == MARVELL_PHY_ID1) {
+        uart_puts("GEM: PHY is Marvell 88E1518\n");
+        return marvell_88e1518_init(phy_addr, speed, full_duplex);
+    }
+    return dp83867_init(phy_addr, speed, full_duplex);
+}
+
+/* Link state of the PHY at `phy_addr`, whichever vendor it is. */
+int gem_phy_link_status(uint8_t phy_addr)
+{
+    /* Generic clause-22 BMSR read works for both PHYs. */
+    return dp83867_link_status(phy_addr);
+}
diff --git a/src/port/amd/ip/phy_dp83867.c b/src/port/amd/ip/phy_dp83867.c
new file mode 100644
index 00000000..a57d45f8
--- /dev/null
+++ b/src/port/amd/ip/phy_dp83867.c
@@ -0,0 +1,338 @@
+/* phy_dp83867.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * TI DP83867IR PHY init (RGMII to the PS GEM).
+ *
+ * The DP83867 needs explicit RGMII TX and RX clock delay configuration
+ * (CFG4 / RGMIICTL extended registers) because these boards route
+ * RGMII signals as a straight-through trace without external delay.
+ * Without this the link comes up at 1 Gbps but carries corrupt data
+ * (random RX frames, no TX). The Linux dp83867 driver and Xilinx
+ * device tree both apply a 2.0 ns TX + 2.0 ns RX skew - we match.
+ *
+ * Extended registers (>0x1F) are accessed via the IEEE-defined indirect
+ * pair (REGCR=0x0D, ADDAR=0x0E):
+ *   1. Write REGCR = 0x001F (address-of, devad 31).
+ *   2. Write ADDAR = <extended register address>.
+ *   3. Write REGCR = 0x401F (data, devad 31, no-increment).
+ *   4. Read/Write ADDAR = <data>.
+ */
+#include <stdint.h>
+#include "gem.h"
+#include "phy_dp83867.h"
+#include "timer.h"
+#include "uart.h"
+
+/* Standard IEEE PHY registers (clause 22) */
+#define PHY_BMCR            0x00
+#define PHY_BMSR            0x01
+#define PHY_ID1             0x02
+#define PHY_ID2             0x03
+#define PHY_ANAR            0x04
+#define PHY_GBCR            0x09
+#define PHY_GBSR            0x0A
+#define PHY_REGCR           0x0D
+#define PHY_ADDAR           0x0E
+
+#define BMCR_RESET          (1u << 15)
+#define BMCR_ANRESTART      (1u << 9)
+#define BMCR_ANEN           (1u << 12)
+
+#define BMSR_ANCOMPLETE     (1u << 5)
+#define BMSR_LINK_UP        (1u << 2)
+
+/* DP83867 extended registers (accessed via REGCR/ADDAR, devad 0x1F) */
+#define DP83867_CFG4        0x0031  /* Configuration 4 (RX_CTRL strap fix) */
+#define DP83867_RGMIICTL    0x0032  /* RGMII control */
+#define DP83867_STRAP_STS1  0x006E  /* Strap status register (read-only) */
+#define DP83867_RGMIIDCTL   0x0086  /* RGMII delay control */
+#define DP83867_IO_MUX_CFG  0x0170  /* IO MUX config (impedance) */
+
+/* Clause-22 register (direct access) */
+#define DP83867_PHYCR       0x10    /* PHY Control register */
+#define PHYCR_FIFO_DEPTH_MASK   (3u << 14)
+#define PHYCR_FIFO_DEPTH_8B     (3u << 14)
+
+/* RGMIICTL bits */
+#define RGMIICTL_RX_DELAY_EN    (1u << 0)
+#define RGMIICTL_TX_DELAY_EN    (1u << 1)
+
+/* RGMIIDCTL: one 4-bit delay code per direction, each step ~0.25 ns.
+ * The same code (0x8) is programmed into both nibbles, so the value
+ * written does not depend on which nibble is TX and which is RX.
+ * NOTE(review): the Linux driver shifts the TX code by 4 (TX in [7:4],
+ * RX in [3:0]), and the TI dt-bindings name code 0x7 as 2.00 ns and
+ * 0x8 as 2.25 ns - confirm against the datasheet. The value is left
+ * unchanged because it is what was brought up on hardware. */
+#define RGMIIDCTL_DELAY_2NS     (0x8u | (0x8u << 4))
+
+/* Speed read from PHY status register (DP83867 0x11) */
+#define DP83867_PHYSTS      0x0011
+#define PHYSTS_SPEED_MASK   (3u << 14)
+#define PHYSTS_SPEED_1000   (2u << 14)
+#define PHYSTS_SPEED_100    (1u << 14)
+#define PHYSTS_SPEED_10     (0u << 14)
+#define PHYSTS_DUPLEX       (1u << 13)
+
+/* Write `val` to extended register `ext_reg` through the REGCR/ADDAR
+ * indirect sequence. Returns the first failing MDIO result (< 0), or
+ * the result of the final data write. */
+static int phy_ext_write(uint8_t phy_addr, uint16_t ext_reg, uint16_t val)
+{
+    int rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x001Fu);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_ADDAR, ext_reg);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x401Fu);
+    if (rc < 0) return rc;
+    return gem_mdio_write(phy_addr, PHY_ADDAR, val);
+}
+
+/* Read extended register `ext_reg` into *out through the REGCR/ADDAR
+ * indirect sequence. Returns the first failing MDIO result (< 0), or
+ * the result of the final data read. */
+static int phy_ext_read(uint8_t phy_addr, uint16_t ext_reg, uint16_t *out)
+{
+    int rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x001Fu);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_ADDAR, ext_reg);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x401Fu);
+    if (rc < 0) return rc;
+    return gem_mdio_read(phy_addr, PHY_ADDAR, out);
+}
+
+/* Reset and configure the DP83867 at `phy_addr`, run auto-negotiation
+ * and report the speed/duplex read from PHYSTS.
+ *
+ * Returns 0 once the PHY has been configured, even if auto-negotiation
+ * or link-up timed out (a TIMEOUT is printed; poll the link with
+ * dp83867_link_status()). Returns a distinct negative code for each
+ * MDIO failure or reset timeout. *speed_out (10/100/1000) and
+ * *full_duplex_out (0/1) are written on success; either pointer may be
+ * NULL, matching marvell_88e1518_init(). */
+int dp83867_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out)
+{
+    uint16_t id1 = 0;
+    uint16_t id2 = 0;
+    uint16_t bmcr;
+    uint16_t bmsr = 0;
+    uint16_t physts;
+    int speed;
+    int fd;
+    int i;
+
+    if (gem_mdio_read(phy_addr, PHY_ID1, &id1) < 0)
+        return -1;
+    if (gem_mdio_read(phy_addr, PHY_ID2, &id2) < 0)
+        return -2;
+    uart_puts("DP83867: ID1="); uart_puthex(id1);
+    uart_puts(" ID2="); uart_puthex(id2);
+    uart_puts("\n");
+    /* DP83867 OUI = 0x2000A23x. ID1=0x2000, ID2 upper bits match. */
+    if (id1 != 0x2000u || (id2 & 0xFFF0u) != 0xA230u) {
+        uart_puts("  warn: PHY ID does not match DP83867, continuing\n");
+    }
+
+    /* Soft reset. */
+    if (gem_mdio_write(phy_addr, PHY_BMCR, BMCR_RESET) < 0)
+        return -3;
+    for (i = 0; i < 1000; i++) {
+        delay_ms(1);
+        if (gem_mdio_read(phy_addr, PHY_BMCR, &bmcr) < 0)
+            return -4;
+        if ((bmcr & BMCR_RESET) == 0)
+            break;
+    }
+    if (i == 1000)
+        return -5;
+
+    /* Order below mirrors the Linux/U-Boot dp83867_config sequence:
+     *   1. Strap fix (CFG4 bit 7) right after SW reset.
+     *   2. PHYCR FIFO depth RMW.
+     *   3. RGMIICTL RMW to enable both delays.
+     *   4. RGMIIDCTL set delay values.
+     *   5. Restart AN (done further down, after the advertisement
+     *      registers are written).
+     *
+     * Every read that feeds a read-modify-write is checked: writing back
+     * a value derived from a failed read (left at 0) would clear all the
+     * other bits of the register. The failure code is the same one the
+     * matching write step reports.
+     */
+    {
+        uint16_t strap = 0;
+        uint16_t cfg4_before = 0;
+        uint16_t cfg4_after = 0;
+        uint16_t iomux = 0;
+        uint16_t rgmiictl = 0;
+        uint16_t phycr_before = 0;
+        uint16_t phycr_after = 0;
+
+        /* Diagnostic-only reads: best effort, values are just printed. */
+        (void)phy_ext_read(phy_addr, DP83867_STRAP_STS1, &strap);
+        (void)phy_ext_read(phy_addr, DP83867_IO_MUX_CFG, &iomux);
+
+        /* 1. RX_CTRL strap quirk. */
+        if (phy_ext_read(phy_addr, DP83867_CFG4, &cfg4_before) < 0)
+            return -6;
+        cfg4_after = cfg4_before & ~(1u << 7);
+        if (phy_ext_write(phy_addr, DP83867_CFG4, cfg4_after) < 0)
+            return -6;
+
+        /* 2. PHYCR FIFO depth = 8 bytes (RMW so we keep Auto-MDIX,
+         *    power-down detect, etc., that the strap brought up). */
+        if (gem_mdio_read(phy_addr, DP83867_PHYCR, &phycr_before) < 0)
+            return -7;
+        phycr_after = (phycr_before & ~PHYCR_FIFO_DEPTH_MASK)
+                    | PHYCR_FIFO_DEPTH_8B;
+        if (gem_mdio_write(phy_addr, DP83867_PHYCR, phycr_after) < 0)
+            return -7;
+
+        /* 3. RGMIICTL: enable TX and RX clock delays (RMW). */
+        if (phy_ext_read(phy_addr, DP83867_RGMIICTL, &rgmiictl) < 0)
+            return -8;
+        rgmiictl |= RGMIICTL_RX_DELAY_EN | RGMIICTL_TX_DELAY_EN;
+        if (phy_ext_write(phy_addr, DP83867_RGMIICTL, rgmiictl) < 0)
+            return -8;
+
+        /* 4. RGMIIDCTL: delay code 0x8 in both nibbles (matches Linux
+         *    ti,*-internal-delay=8). */
+        if (phy_ext_write(phy_addr, DP83867_RGMIIDCTL,
+                          RGMIIDCTL_DELAY_2NS) < 0)
+            return -9;
+
+#ifdef DEBUG_PHY
+        /* Verbose pre-AN dump so we can diff against U-Boot's state. */
+        uart_puts("DP83867 pre-AN: STRAP_STS1="); uart_puthex(strap);
+        uart_puts(" IO_MUX_CFG="); uart_puthex(iomux);
+        uart_puts("\n  CFG4: "); uart_puthex(cfg4_before);
+        uart_puts(" -> "); uart_puthex(cfg4_after);
+        uart_puts("  PHYCR: "); uart_puthex(phycr_before);
+        uart_puts(" -> "); uart_puthex(phycr_after);
+        uart_puts("\n  RGMIICTL="); uart_puthex(rgmiictl);
+        uart_puts(" RGMIIDCTL="); uart_puthex(RGMIIDCTL_DELAY_2NS);
+        uart_puts("\n");
+
+        {
+            uint16_t v;
+            (void)phy_ext_read(phy_addr, DP83867_CFG4, &v);
+            uart_puts("DP83867 readback: CFG4="); uart_puthex(v);
+            (void)phy_ext_read(phy_addr, DP83867_RGMIICTL, &v);
+            uart_puts(" RGMIICTL="); uart_puthex(v);
+            (void)phy_ext_read(phy_addr, DP83867_RGMIIDCTL, &v);
+            uart_puts(" RGMIIDCTL="); uart_puthex(v);
+            (void)gem_mdio_read(phy_addr, DP83867_PHYCR, &v);
+            uart_puts(" PHYCR="); uart_puthex(v);
+            uart_puts("\n");
+        }
+#else
+        (void)strap; (void)iomux;
+        (void)cfg4_before; (void)cfg4_after;
+        (void)phycr_before; (void)phycr_after;
+        (void)rgmiictl;
+#endif
+    }
+
+    /* Advertise 10/100/1000 full + half duplex. */
+    if (gem_mdio_write(phy_addr, PHY_ANAR, 0x01E1u) < 0)
+        return -13;
+    if (gem_mdio_write(phy_addr, PHY_GBCR, (1u << 9) | (1u << 8)) < 0)
+        return -14;
+
+    /* Restart AN. */
+    if (gem_mdio_write(phy_addr, PHY_BMCR, BMCR_ANEN | BMCR_ANRESTART) < 0)
+        return -10;
+
+    /* Wait up to 5 s for AN complete, polling at 50 ms. AN typically
+     * needs 100-1500 ms depending on link partner. Report progress so
+     * a hung negotiation is visible on UART. */
+    uart_puts("DP83867: waiting for autoneg");
+    for (i = 0; i < 100; i++) {
+        delay_ms(50);
+        if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+            return -11;
+        if (bmsr & BMSR_ANCOMPLETE) {
+            uart_puts(" done (");
+            uart_putdec((uint32_t)i * 50u);
+            uart_puts("ms)\n");
+            break;
+        }
+        if ((i % 10) == 9)
+            uart_putc('.');
+    }
+    if (!(bmsr & BMSR_ANCOMPLETE))
+        uart_puts(" TIMEOUT\n");
+
+    /* Give the PHY a moment to latch the negotiated speed before we
+     * read PHYSTS - on DP83867 link-OK and PHYSTS update slightly
+     * after AN_COMPLETE asserts. */
+    delay_ms(100);
+
+    /* After AN_COMPLETE, the 1000BASE-T link still needs to finish
+     * master/slave training and have BOTH receivers report OK before
+     * BMSR.LINK_UP asserts. This can take several hundred ms more.
+     * Poll BMSR (double-read for latch) up to 5 s, dumping GBSR each
+     * iteration so we can see remote_rx_status flip. */
+    {
+        int j;
+        uint16_t gbsr = 0;
+        uint16_t bmsr2 = 0;
+        uart_puts("DP83867: waiting for link");
+        for (j = 0; j < 100; j++) {
+            delay_ms(50);
+            (void)gem_mdio_read(phy_addr, PHY_BMSR, &bmsr2);
+            (void)gem_mdio_read(phy_addr, PHY_BMSR, &bmsr2);
+            (void)gem_mdio_read(phy_addr, PHY_GBSR, &gbsr);
+            if (bmsr2 & BMSR_LINK_UP) {
+                uart_puts(" UP (");
+                uart_putdec((uint32_t)j * 50u);
+                uart_puts("ms) GBSR=");
+                uart_puthex(gbsr);
+                uart_puts("\n");
+                bmsr = bmsr2;
+                break;
+            }
+            if ((j % 10) == 9) {
+                uart_puts(" [");
+                uart_putdec((uint32_t)(j + 1) * 50u);
+                uart_puts("ms GBSR=");
+                uart_puthex(gbsr);
+                uart_puts("]");
+            }
+        }
+        if (!(bmsr2 & BMSR_LINK_UP))
+            uart_puts(" TIMEOUT\n");
+    }
+
+    if (gem_mdio_read(phy_addr, DP83867_PHYSTS, &physts) < 0)
+        return -12;
+
+#ifdef DEBUG_PHY
+    {
+        uint16_t bmcr_now = 0;
+        uint16_t lpa = 0;
+        uint16_t gbsr = 0;
+        (void)gem_mdio_read(phy_addr, PHY_BMCR, &bmcr_now);
+        (void)gem_mdio_read(phy_addr, 0x05, &lpa);     /* MII LPA */
+        (void)gem_mdio_read(phy_addr, PHY_GBSR, &gbsr);
+        uart_puts("DP83867 regs: BMCR="); uart_puthex(bmcr_now);
+        uart_puts(" BMSR="); uart_puthex(bmsr);
+        uart_puts(" LPA="); uart_puthex(lpa);
+        uart_puts(" GBSR="); uart_puthex(gbsr);
+        uart_puts(" PHYSTS="); uart_puthex(physts);
+        uart_puts("\n");
+    }
+#endif
+
+    if ((physts & PHYSTS_SPEED_MASK) == PHYSTS_SPEED_1000)
+        speed = 1000;
+    else if ((physts & PHYSTS_SPEED_MASK) == PHYSTS_SPEED_100)
+        speed = 100;
+    else
+        speed = 10;
+    fd = (physts & PHYSTS_DUPLEX) ? 1 : 0;
+
+    uart_puts("DP83867 link: ");
+    uart_putdec((uint32_t)speed);
+    uart_puts(fd ? " Mbps FD\n" : " Mbps HD\n");
+
+    /* Outputs are optional, as in marvell_88e1518_init(). */
+    if (speed_out)
+        *speed_out = speed;
+    if (full_duplex_out)
+        *full_duplex_out = fd;
+
+    return 0;  /* init OK; link state is read via dp83867_link_status() */
+}
+
+/* Returns 1 if BMSR reports link up, 0 if down, -1 on MDIO error. */
+int dp83867_link_status(uint8_t phy_addr)
+{
+    uint16_t bmsr;
+    /* BMSR latches link down; read twice. */
+    if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+        return -1;
+    if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+        return -1;
+    return (bmsr & BMSR_LINK_UP) ? 1 : 0;
+}
diff --git a/src/port/amd/ip/phy_dp83867.h b/src/port/amd/ip/phy_dp83867.h
new file mode 100644
index 00000000..acb351cb
--- /dev/null
+++ b/src/port/amd/ip/phy_dp83867.h
@@ -0,0 +1,25 @@
+/* phy_dp83867.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * TI DP83867IR PHY driver: 10/100/1000 RGMII PHY. On some boards the
+ * PHY vendor varies (e.g. the ZC702 fits a Marvell 88E1518; the GEM
+ * driver dispatches on the MDIO vendor ID). We only need configuration
+ * (reset, RGMII TX/RX skew, auto-negotiation) and link status; no
+ * advanced features.
+ */
+#ifndef AMD_PHY_DP83867_H
+#define AMD_PHY_DP83867_H
+
+#include <stdint.h>
+
+/* Returns 0 on success, < 0 on failure. On success *speed_out and
+ * *full_duplex_out (each optional, may be NULL) are the negotiated speed
+ * (10/100/1000) and full-duplex flag. */
+int dp83867_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out);
+
+/* Returns 1 if link is up, 0 if down, < 0 on MDIO error.
*/
+int dp83867_link_status(uint8_t phy_addr);
+
+#endif /* AMD_PHY_DP83867_H */
diff --git a/src/port/amd/ip/phy_marvell.c b/src/port/amd/ip/phy_marvell.c
new file mode 100644
index 00000000..f177d703
--- /dev/null
+++ b/src/port/amd/ip/phy_marvell.c
@@ -0,0 +1,168 @@
+/* phy_marvell.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * Marvell 88E1518 (Alaska) RGMII PHY driver for the Xilinx ZC702
+ * on-board PS-GEM RJ45. The DP83867 driver cannot be reused here: the
+ * 88E1518 uses Marvell's paged register model and a different RGMII
+ * internal-delay control, and the DP83867 MMD writes (registers 13/14)
+ * land on Marvell MMD space and prevent auto-negotiation from
+ * completing. This driver does a clean reset, programs the RGMII RX/TX
+ * internal delays, then runs standard clause-22 auto-negotiation.
+ */
+#include <stdint.h>
+#include "gem.h"
+#include "uart.h"
+#include "timer.h"
+#include "phy_marvell.h"
+
+/* Standard clause-22 MII registers (page 0). */
+#define MII_BMCR            0x00u
+#define MII_BMSR            0x01u
+#define MII_ID1             0x02u
+#define MII_ID2             0x03u
+#define MII_ANAR            0x04u
+#define MII_GBCR            0x09u   /* 1000BASE-T control */
+
+#define BMCR_RESET          (1u << 15)
+#define BMCR_ANEN           (1u << 12)
+#define BMCR_ANRESTART      (1u << 9)
+
+#define BMSR_ANCOMPLETE     (1u << 5)
+#define BMSR_LINK_UP        (1u << 2)
+
+/* Marvell paging: register 22 selects the page for registers 0..21. */
+#define MARVELL_PAGE_SEL    22u
+
+/* Page 2, register 21: MAC Specific Control Register 2. RGMII internal
+ * delay enables live here. */
+#define MARVELL_PAGE_MAC    2u
+#define M88E1518_MAC_CTRL2  21u
+#define MAC_CTRL2_RX_DELAY  (1u << 5)   /* add internal delay to RGMII RXCLK */
+#define MAC_CTRL2_TX_DELAY  (1u << 4)   /* add internal delay to RGMII TXCLK */
+
+/* Page 0, register 17: Copper Specific Status Register 1. */
+#define M88E1518_COPPER_STS     17u
+#define COPPER_STS_SPEED_MASK   (3u << 14)
+#define COPPER_STS_SPEED_1000   (2u << 14)
+#define COPPER_STS_SPEED_100    (1u << 14)
+#define COPPER_STS_FULL_DUPLEX  (1u << 13)
+#define COPPER_STS_RESOLVED     (1u << 11)
+
+/* Select the register page used by subsequent accesses to registers
+ * 0..21. Returns the gem_mdio_write() result. */
+static int marvell_set_page(uint8_t phy_addr, uint16_t page)
+{
+    return gem_mdio_write(phy_addr, MARVELL_PAGE_SEL, page);
+}
+
+/* Enable the RGMII internal delays, soft-reset the PHY, run
+ * auto-negotiation and report the resolved speed/duplex.
+ *
+ * Returns 0 once the PHY has been configured, even if auto-negotiation
+ * or link-up timed out (a TIMEOUT is printed), or a distinct negative
+ * code for each MDIO failure / reset timeout. The outputs default to
+ * 10 Mbps half duplex when the PHY does not report a resolved link;
+ * either output pointer may be NULL. */
+int marvell_88e1518_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out)
+{
+    uint16_t reg = 0;
+    uint16_t bmsr = 0;
+    int i;
+    int speed = 10;
+    int fd = 0;
+
+    /* Program the RGMII RX/TX internal delays (page 2, register 21). The
+     * 88E1518 latches these on the next software reset, so set them
+     * before the reset below. */
+    if (marvell_set_page(phy_addr, MARVELL_PAGE_MAC) < 0)
+        return -1;
+    /* From here the PHY is on page 2; restore page 0 on every error exit so
+     * later generic clause-22 accesses (BMSR/BMCR on page 0) are not left
+     * pointing at the MAC page. */
+    if (gem_mdio_read(phy_addr, M88E1518_MAC_CTRL2, &reg) < 0) {
+        (void)marvell_set_page(phy_addr, 0);
+        return -2;
+    }
+    reg |= (MAC_CTRL2_RX_DELAY | MAC_CTRL2_TX_DELAY);
+    if (gem_mdio_write(phy_addr, M88E1518_MAC_CTRL2, reg) < 0) {
+        (void)marvell_set_page(phy_addr, 0);
+        return -3;
+    }
+    if (marvell_set_page(phy_addr, 0) < 0)
+        return -4;
+
+    /* Software reset to apply the delay configuration. */
+    if (gem_mdio_write(phy_addr, MII_BMCR, BMCR_RESET) < 0)
+        return -5;
+    for (i = 0; i < 100; i++) {
+        delay_ms(5);
+        if (gem_mdio_read(phy_addr, MII_BMCR, &reg) < 0)
+            return -6;
+        if (!(reg & BMCR_RESET))
+            break;
+    }
+    if (reg & BMCR_RESET)
+        return -7;
+
+    /* Advertise 10/100 (full+half) and 1000 (full+half), then restart
+     * auto-negotiation. */
+    if (gem_mdio_write(phy_addr, MII_ANAR, 0x01E1u) < 0)
+        return -8;
+    if (gem_mdio_write(phy_addr, MII_GBCR, (1u << 9) | (1u << 8)) < 0)
+        return -9;
+    if (gem_mdio_write(phy_addr, MII_BMCR, BMCR_ANEN | BMCR_ANRESTART) < 0)
+        return -10;
+
+    /* Wait up to 5 s for AN to complete, 50 ms poll. */
+    uart_puts("88E1518: waiting for autoneg");
+    for (i = 0; i < 100; i++) {
+        delay_ms(50);
+        if (gem_mdio_read(phy_addr, MII_BMSR, &bmsr) < 0)
+            return -11;
+        if (bmsr & BMSR_ANCOMPLETE) {
+            uart_puts(" done (");
+            uart_putdec((uint32_t)i * 50u);
+            uart_puts("ms)\n");
+            break;
+        }
+        if ((i % 10) == 9)
+            uart_putc('.');
+    }
+    if (!(bmsr & BMSR_ANCOMPLETE))
+        uart_puts(" TIMEOUT\n");
+
+    /* Wait up to 5 s for the copper link to come up (BMSR is latch-low,
+     * so double-read). */
+    uart_puts("88E1518: waiting for link");
+    for (i = 0; i < 100; i++) {
+        delay_ms(50);
+        (void)gem_mdio_read(phy_addr, MII_BMSR, &bmsr);
+        (void)gem_mdio_read(phy_addr, MII_BMSR, &bmsr);
+        if (bmsr & BMSR_LINK_UP) {
+            uart_puts(" UP (");
+            uart_putdec((uint32_t)i * 50u);
+            uart_puts("ms)\n");
+            break;
+        }
+        if ((i % 10) == 9)
+            uart_putc('.');
+    }
+    if (!(bmsr & BMSR_LINK_UP))
+        uart_puts(" TIMEOUT\n");
+
+    /* Read the resolved speed/duplex from the Copper Specific Status
+     * register (page 0, register 17). */
+    if (gem_mdio_read(phy_addr, M88E1518_COPPER_STS, &reg) < 0)
+        return -12;
+#ifdef DEBUG_PHY
+    uart_puts("88E1518 copper status="); uart_puthex(reg); uart_puts("\n");
+#endif
+    if (reg & COPPER_STS_RESOLVED) {
+        if ((reg & COPPER_STS_SPEED_MASK) == COPPER_STS_SPEED_1000)
+            speed = 1000;
+        else if ((reg & COPPER_STS_SPEED_MASK) == COPPER_STS_SPEED_100)
+            speed = 100;
+        else
+            speed = 10;
+        fd = (reg & COPPER_STS_FULL_DUPLEX) ? 1 : 0;
+    }
+
+    if (speed_out)
+        *speed_out = speed;
+    if (full_duplex_out)
+        *full_duplex_out = fd;
+    return 0;
+}
diff --git a/src/port/amd/ip/phy_marvell.h b/src/port/amd/ip/phy_marvell.h
new file mode 100644
index 00000000..580d3552
--- /dev/null
+++ b/src/port/amd/ip/phy_marvell.h
@@ -0,0 +1,24 @@
+/* phy_marvell.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * Marvell 88E1518 (Alaska) RGMII PHY driver.
This is the PHY fitted to
+ * the on-board PS-GEM RJ45 on the Xilinx ZC702 (PHY OUI 0x0141), as
+ * opposed to the TI DP83867 used on the ZCU102. We only need RGMII
+ * delay configuration, auto-negotiation and link/speed status.
+ */
+#ifndef AMD_PHY_MARVELL_H
+#define AMD_PHY_MARVELL_H
+
+#include <stdint.h>
+
+/* PHY ID1 (MII register 2) OUI high word for Marvell. */
+#define MARVELL_PHY_ID1     0x0141u
+
+/* Returns 0 on success, < 0 on failure. On success *speed_out and
+ * *full_duplex_out are the negotiated speed (10/100/1000) and FD flag
+ * (10 / half duplex if the PHY reports no resolved link). Either output
+ * pointer may be NULL. */
+int marvell_88e1518_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out);
+
+#endif /* AMD_PHY_MARVELL_H */
diff --git a/src/port/amd/ip/uart_cadence.c b/src/port/amd/ip/uart_cadence.c
new file mode 100644
index 00000000..95b1b1df
--- /dev/null
+++ b/src/port/amd/ip/uart_cadence.c
@@ -0,0 +1,85 @@
+/* uart_cadence.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * Cadence (Xilinx PS) UART polled driver, shared by the Cadence-UART
+ * boards (ZynqMP UART0, Zynq-7000 UART1). The console base is UART_BASE
+ * (board.h).
If the board defines UART_PROGRAM_BAUD we program 8N1 and
+ * the board's baud divisors (UART_BAUDGEN_CD / UART_BAUDDIV_BDIV);
+ * otherwise we trust the FSBL's baud setup and only enable TX/RX.
+ *
+ * Register reference: Xilinx PS UART (Cadence) -- ZynqMP UG1085 /
+ * Zynq-7000 UG585 "UART Controller".
+ */
+#include <stdint.h>
+#include "board.h"
+#include "uart.h"
+
+#define UART_CR             (*(volatile uint32_t *)(UART_BASE + 0x00))
+#define UART_MR             (*(volatile uint32_t *)(UART_BASE + 0x04))
+#define UART_BAUDGEN        (*(volatile uint32_t *)(UART_BASE + 0x18))
+#define UART_BAUD_DIV       (*(volatile uint32_t *)(UART_BASE + 0x34))
+#define UART_CHANNEL_STS    (*(volatile uint32_t *)(UART_BASE + 0x2C))
+#define UART_TX_RX_FIFO     (*(volatile uint32_t *)(UART_BASE + 0x30))
+
+/* Control register bits */
+#define UART_CR_TXRES       (1u << 1)   /* TX software reset */
+#define UART_CR_RXRES       (1u << 0)   /* RX software reset */
+#define UART_CR_TXEN        (1u << 4)
+#define UART_CR_TXDIS       (1u << 5)
+#define UART_CR_RXEN        (1u << 2)
+#define UART_CR_RXDIS       (1u << 3)
+#define UART_CR_STPBRK      (1u << 8)
+
+/* Mode register: 8N1, normal, no parity */
+#define UART_MR_8N1         ((0u << 8) | (4u << 3) | (0u << 1))
+
+/* Channel status */
+#define UART_SR_TXFULL      (1u << 4)
+#define UART_SR_TXEMPTY     (1u << 3)
+
+/* Bring up the console UART. With UART_PROGRAM_BAUD the controller is
+ * disabled, reset and reprogrammed for 8N1 at the board's divisors;
+ * in every build the transmitter and receiver are then enabled. */
+void uart_init(void)
+{
+#ifdef UART_PROGRAM_BAUD
+    /* Program 8N1 + baud. The board supplies divisors sized for its
+     * UART_REF_CLK via UART_BAUDGEN_CD / UART_BAUDDIV_BDIV (Cadence:
+     * baud = sel_clk / (CD * (BDIV + 1))). Boards whose FSBL already set
+     * the baud (and whose ref clock is not known here) leave
+     * UART_PROGRAM_BAUD undefined -- reprogramming would garble output. */
+    UART_CR = UART_CR_TXDIS | UART_CR_RXDIS;
+    UART_CR |= UART_CR_TXRES | UART_CR_RXRES;
+    while (UART_CR & (UART_CR_TXRES | UART_CR_RXRES))
+        ;   /* wait for reset to self-clear */
+
+    UART_MR = UART_MR_8N1;
+    UART_BAUDGEN = UART_BAUDGEN_CD;
+    UART_BAUD_DIV = UART_BAUDDIV_BDIV;
+#endif
+    UART_CR = UART_CR_TXEN | UART_CR_RXEN | UART_CR_STPBRK;
+}
+
+/* Blocking transmit of one byte: spin while the TX FIFO is full, then
+ * queue the character. */
+void uart_putc(char c)
+{
+    while (UART_CHANNEL_STS & UART_SR_TXFULL)
+        ;
+    UART_TX_RX_FIFO = (uint32_t)(unsigned char)c;
+}
+
+/* uart_puts / uart_puthex / uart_putdec / uart_putip4 are shared and live
+ * in common/uart_util.c. */
diff --git a/src/port/amd/ip/uart_pl011.c b/src/port/amd/ip/uart_pl011.c
new file mode 100644
index 00000000..3777ca02
--- /dev/null
+++ b/src/port/amd/ip/uart_pl011.c
@@ -0,0 +1,89 @@
+/* uart_pl011.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * ARM PL011 UART polled driver. Versal routes UART0 to the on-board
+ * USB-UART on VMK180. We assume the PLM has already pinned the UART
+ * pins via the LPD configuration object and enabled the reference
+ * clock (typically 100 MHz IOPLL-derived); this driver programs the
+ * baud divisors and enables TX/RX.
+ *
+ * Register reference: ARM PrimeCell PL011 UART (DDI 0183). The Versal
The Versal
+ * versal.dtsi maps PL011 base addresses to 0xFF000000 (UART0) and
+ * 0xFF010000 (UART1).
+ *
+ * Brought up on a VMK180 (Cortex-A72 EL3).
+ */
+#include <stdint.h>
+#include "board.h"
+#include "uart.h"
+
+/* PL011 registers, all 32-bit. */
+#define UART_DR    (*(volatile uint32_t *)(UART_BASE + 0x000)) /* data */
+#define UART_FR    (*(volatile uint32_t *)(UART_BASE + 0x018)) /* flag */
+#define UART_IBRD  (*(volatile uint32_t *)(UART_BASE + 0x024)) /* int baud */
+#define UART_FBRD  (*(volatile uint32_t *)(UART_BASE + 0x028)) /* frac baud */
+#define UART_LCR_H (*(volatile uint32_t *)(UART_BASE + 0x02C)) /* line ctrl */
+#define UART_CR    (*(volatile uint32_t *)(UART_BASE + 0x030)) /* control */
+#define UART_IMSC  (*(volatile uint32_t *)(UART_BASE + 0x038)) /* irq mask */
+#define UART_ICR   (*(volatile uint32_t *)(UART_BASE + 0x044)) /* irq clr */
+
+#define UART_FR_TXFF (1u << 5) /* TX FIFO full */
+#define UART_FR_TXFE (1u << 7) /* TX FIFO empty */
+#define UART_FR_BUSY (1u << 3) /* UART busy transmitting */
+
+#define UART_LCR_H_WLEN_8 (3u << 5) /* 8-bit word length */
+#define UART_LCR_H_FEN    (1u << 4) /* FIFO enable */
+
+#define UART_CR_UARTEN (1u << 0) /* UART enable */
+#define UART_CR_TXE    (1u << 8) /* transmit enable */
+#define UART_CR_RXE    (1u << 9) /* receive enable */
+
+/* Baud formulas (PL011):
+ *   BAUDDIV = (UARTCLK * 4) / baud
+ *   IBRD    = BAUDDIV / 64
+ *   FBRD    = BAUDDIV % 64
+ * For UARTCLK = 100 MHz, baud = 115200:
+ *   BAUDDIV = (100e6 * 4) / 115200 = 3472
+ *   IBRD    = 3472 / 64 = 54
+ *   FBRD    = 3472 % 64 = 16
+ * Actual baud = (100e6 * 4) / ((54 * 64) + 16) = 100e6 / 868 = 115207 */
+#define UART_IBRD_115200 54
+#define UART_FBRD_115200 16
+
+void uart_init(void)
+{
+    UART_CR = 0;       /* disable while configuring */
+    UART_ICR = 0x7FF;  /* clear all interrupts */
+    UART_IMSC = 0;     /* mask all interrupts */
+    UART_IBRD = UART_IBRD_115200;
+    UART_FBRD = UART_FBRD_115200;
+    UART_LCR_H = UART_LCR_H_WLEN_8 | UART_LCR_H_FEN; /* after IBRD/FBRD */
+    UART_CR = UART_CR_UARTEN | UART_CR_TXE | UART_CR_RXE;
+}
+
+void uart_putc(char c)
+{
+    while (UART_FR & UART_FR_TXFF)
+        ; /* spin while the TX FIFO is full */
+    UART_DR = (uint32_t)(unsigned char)c;
+}
+
+/* 
uart_puts / uart_puthex / uart_putdec / uart_putip4 are shared and live
+ * in common/uart_util.c. */
diff --git a/tools/scripts/amd/qemu-smoke.sh b/tools/scripts/amd/qemu-smoke.sh
new file mode 100755
index 00000000..6d0bccc1
--- /dev/null
+++ b/tools/scripts/amd/qemu-smoke.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+#
+# QEMU boot smoke test for a wolfIP AMD/Xilinx bare-metal port.
+#
+# Boots boards/<board>/app.elf under the matching mainline Xilinx QEMU machine,
+# captures the UART console, and asserts the app reaches its "Ready" banner -
+# i.e. startup (EL3/SVC), MMU, GIC, UART, the GEM bring-up and the wolfIP main
+# loop all execute under emulation. The PHY autoneg/link waits time out
+# gracefully under QEMU (no real link), and DHCP then falls back to a static IP,
+# so reaching "Ready" is a robust gate that does not depend on QEMU's GEM PHY
+# reporting link. (Full DHCP/echo over QEMU user-net is a separate, best-effort
+# concern and is intentionally not wired here.)
+#
+# Usage: qemu-smoke.sh <board> [app.elf]
+# Env:   QEMU_TIMEOUT (seconds, default 120), UART_LOG (default uart-<board>.log)
+# Exit:  0 = "Ready" seen, 1 = fault marker or timeout, 2 = setup error
+#
+set -u
+
+BOARD="${1:?usage: qemu-smoke.sh <board> [app.elf]}"
+ELF="${2:-src/port/amd/boards/$BOARD/app.elf}"
+TIMEOUT="${QEMU_TIMEOUT:-120}"
+LOG="${UART_LOG:-uart-$BOARD.log}"
+
+if [ ! -f "$ELF" ]; then
+    echo "ERROR: app.elf not found: $ELF (build it first)" >&2
+    exit 2
+fi
+
+# Per-board QEMU machine + console UART routing. The console UART differs per
+# board (zcu102 = PS-UART0 = serial0; versal = PL011 = serial0; zynq7000 =
+# UART1 = serial1, so serial0 is routed to null).
+case "$BOARD" in
+    zcu102)
+        QEMU=qemu-system-aarch64
+        MACHINE="xlnx-zcu102,secure=on"
+        SERIAL=(-serial "mon:stdio")
+        ;;
+    versal)
+        QEMU=qemu-system-aarch64
+        MACHINE="xlnx-versal-virt"
+        SERIAL=(-serial "mon:stdio")
+        ;;
+    zynq7000)
+        QEMU=qemu-system-arm
+        MACHINE="xilinx-zynq-a9"
+        SERIAL=(-serial null -serial "mon:stdio")
+        ;;
+    *)
+        echo "ERROR: unknown board '$BOARD'" >&2
+        exit 2
+        ;;
+esac
+
+if ! command -v "$QEMU" >/dev/null 2>&1; then
+    echo "ERROR: $QEMU not found (install qemu-system-arm / qemu-system-aarch64)" >&2
+    exit 2
+fi
+
+echo "=== QEMU smoke: $BOARD ($QEMU -M $MACHINE), elf=$ELF, timeout=${TIMEOUT}s ==="
+: > "$LOG"
+
+# Bare-metal load: -device loader sets PC to the ELF entry at the machine's
+# reset EL (EL3 on the AArch64 machines, SVC on zynq-a9). No netdev is attached:
+# the gate does not need networking, and an unconsumed -netdev would error out.
+"$QEMU" -M "$MACHINE" -nographic -no-reboot \
+    "${SERIAL[@]}" \
+    -device "loader,file=$ELF,cpu-num=0" \
+    >>"$LOG" 2>&1 &
+QPID=$!
+
+ok=0
+fault=0
+
+# Classify the console captured so far. Sets ok=1 when the "Ready" banner is
+# present, otherwise fault=1 on a hard-fault / abort marker from the exception
+# vectors or QEMU itself. Returns 0 once either verdict has been reached.
+scan_log() {
+    if grep -qa "Ready" "$LOG"; then ok=1; return 0; fi
+    if grep -qaiE "synchronous exception|unhandled|abort|panic" "$LOG"; then
+        fault=1; return 0
+    fi
+    return 1
+}
+
+deadline=$((SECONDS + TIMEOUT))
+while kill -0 "$QPID" 2>/dev/null; do
+    if scan_log; then break; fi
+    if [ "$SECONDS" -ge "$deadline" ]; then break; fi
+    sleep 2
+done
+
+kill "$QPID" 2>/dev/null
+wait "$QPID" 2>/dev/null || true
+
+# Final pass: QEMU can print the banner (or a fault) and exit between two polls,
+# or before the first one, which ends the loop without the log being read.
+if [ "$ok" -eq 0 ] && [ "$fault" -eq 0 ]; then
+    scan_log || true
+fi
+
+echo "----- captured UART ($LOG) -----"
+cat "$LOG"
+echo "--------------------------------"
+
+if [ "$ok" -eq 1 ]; then
+    echo "PASS: $BOARD reached 'Ready' under QEMU"
+    exit 0
+fi
+if [ "$fault" -eq 1 ]; then
+    echo "FAIL: $BOARD hit a fault/abort marker before 'Ready'" >&2
+    exit 1
+fi
+echo "FAIL: $BOARD did not reach 'Ready' within ${TIMEOUT}s" >&2
+exit 1
diff --git a/tools/scripts/zcu102/README.md b/tools/scripts/zcu102/README.md
new file mode 100644
index 00000000..7a3edc59
--- /dev/null
+++ b/tools/scripts/zcu102/README.md
@@ -0,0 +1,62 @@
+# ZCU102 JTAG bare-metal loader
+
+`jtag_load.tcl` is a generic AArch64 bare-metal JTAG loader for the
+Xilinx ZCU102 (ZynqMP Cortex-A53 EL3). It lets you iterate on
+bare-metal firmware without swapping the SD card.
+
+The `src/port/amd/boards/zcu102/` directory has a wolfIP-specific wrapper around
+this same pattern at `src/port/amd/boards/zcu102/jtag/boot.tcl`; this directory
+holds the standalone reference so the pattern can be cloned into
+other wolfSSL projects (wolfBoot, wolfTPM, wolfHSM, etc.) targeting
+the same SoC.
+
+## Usage
+
+```sh
+source /opt/Xilinx/2025.2/Vitis/settings64.sh
+
+# Build and produce a flat binary for the loader.
+aarch64-none-elf-objcopy -O binary myapp.elf myapp.bin
+
+APP_ELF=$PWD/myapp.elf \
+APP_BIN=$PWD/myapp.bin \
+FSBL_PSU_INIT_TCL=/path/to/petalinux/hw-description/psu_init.tcl \
+xsdb tools/scripts/zcu102/jtag_load.tcl
+```
+
+ZCU102 must be in JTAG boot mode (SW6 = all ON). The loader expects
+hw_server already running on localhost (Vitis starts it by default).
+
+## What it does
+
+1. `rst -system`, then `mwr 0xFF5E0200 0x0100` to force CSU JTAG bootmode
+2. `psu_init` + `psu_post_config` to bring DDR / clocks / MIO / UART up
+3. Re-initialize UART0 baud (psu_init alone doesn't always finish this)
+4. Load `APP_BIN` word-by-word via `mwr -force` to OCM (0xFFFC0000)
+5. Install a `b .` bootloop at the default RVBAR (0xFFFF0000)
+6. `rst -processor` + `stop` + `rwr pc <entry>` + `con`
+
+## Constraints
+
+- App `.text` + `.rodata` + `.data` must fit in OCM (256 KiB).
+- App `.bss`, page tables, DMA buffers go in DDR, **above 0x10000** (64 KiB;
+  this avoids the first 16 KiB of DDR, which has a JTAG-DAP alias bug).
+- MMU page tables must map the OCM 2 MiB block (entry 511 of an L2
+  covering 0xC0000000..0xFFFFFFFF) as Normal + executable. Otherwise
+  `mmu_enable` faults on the next instruction fetch.
+
+## Five traps this loader avoids
+
+The corresponding wolfIP-specific loader at `src/port/amd/boards/zcu102/jtag/boot.tcl`
+has inline comments at each step. The traps are:
+
+1. DDR DAP 16-KiB alias at low addresses (use OCM).
+2. MMU L1 needs OCM carved out as Normal+exec (not Device+XN).
+3. CSU JTAG bootmode bit must be written before psu_init.
+4. 
`dow` to DDR breaks after psu_init - use `mwr -force` per word.
+5. RVBAR bootloop at 0xFFFF0000 lets `rst -processor` be safe.
+
+## Related
+
+- `src/port/amd/boards/zcu102/jtag/boot.tcl` -- wolfIP-specific instance
+- `tools/scripts/zynq7000/jtag_load.tcl` in `wolfBoot` -- ARMv7 analog
diff --git a/tools/scripts/zcu102/jtag_load.tcl b/tools/scripts/zcu102/jtag_load.tcl
new file mode 100644
index 00000000..5800b243
--- /dev/null
+++ b/tools/scripts/zcu102/jtag_load.tcl
@@ -0,0 +1,187 @@
+# jtag_load.tcl - generic AArch64 JTAG bare-metal loader for ZCU102.
+#
+# Source-agnostic: works for any AArch64 EL3 bare-metal ELF whose
+# loadable text + vectors fit in OCM (0xFFFC0000, 256 KiB).
+# BSS / page-tables / DMA buffers can live in DDR; they get zeroed
+# by the app's own startup code, so it doesn't matter that DDR has
+# a JTAG-DAP 16-KiB alias bug at low addresses.
+#
+# Usage:
+#   source /opt/Xilinx/2025.2/Vitis/settings64.sh  # for xsdb on PATH
+#   FSBL_PSU_INIT_TCL=/path/to/psu_init.tcl \
+#   APP_ELF=/path/to/app.elf \
+#   APP_BIN=/path/to/app.bin \
+#   xsdb tools/scripts/zcu102/jtag_load.tcl
+#
+# READELF may be set to override the default aarch64-none-elf-readelf.
+#
+# Set the ZCU102 SW6 boot-mode straps to ALL ON (JTAG mode 0000)
+# and power the board on before running.
+#
+# This pattern was distilled from a working Xilinx PUF-provision
+# JTAG loader. See src/port/amd/boards/zcu102/README.md and the comments in
+# src/port/amd/boards/zcu102/jtag/boot.tcl for the full set of traps this
+# loader is built to avoid.
+
+set OCM_BASE   0xFFFC0000
+set OCM_SIZE   0x40000      ;# 256 KiB, so OCM ends at 0xFFFFFFFF
+set RVBAR_ADDR 0xFFFF0000   ;# default RVBAR_EL3; inside the OCM window
+
+if {![info exists ::env(APP_ELF)] || ![info exists ::env(APP_BIN)] \
+    || ![info exists ::env(FSBL_PSU_INIT_TCL)]} {
+    puts "Usage: APP_ELF=... APP_BIN=... FSBL_PSU_INIT_TCL=... xsdb $argv0"
+    exit 1
+}
+foreach var {APP_ELF APP_BIN FSBL_PSU_INIT_TCL} {
+    if {![file exists $::env($var)]} {
+        puts "ERROR: $var = $::env($var) not found"
+        exit 1
+    }
+}
+
+# ---------------------------------------------------------------------
+# Host-side checks, done before the board is touched.
+# ---------------------------------------------------------------------
+# The image is written from OCM_BASE upwards, so it must fit in OCM.
+set app_size [file size $::env(APP_BIN)]
+if {$app_size > $OCM_SIZE} {
+    puts "ERROR: APP_BIN is $app_size bytes; OCM holds [expr {$OCM_SIZE}]"
+    exit 1
+}
+# Step 5 writes its two-word stub after the image has been loaded.
+if {$OCM_BASE + $app_size > $RVBAR_ADDR} {
+    puts "WARNING: APP_BIN reaches the RVBAR boot loop at\
+          [format 0x%08X $RVBAR_ADDR]; step 5 overwrites 8 image bytes there"
+}
+
+# Entry point from the ELF header, parsed in Tcl so the host needs neither
+# grep nor awk.
+set readelf "aarch64-none-elf-readelf"
+if {[info exists ::env(READELF)]} { set readelf $::env(READELF) }
+set elf_hdr [exec $readelf -h $::env(APP_ELF)]
+if {![regexp {Entry point address:\s*(0x[0-9a-fA-F]+)} $elf_hdr -> entry]} {
+    puts "ERROR: no entry point in '$readelf -h' output for $::env(APP_ELF)"
+    exit 1
+}
+
+# ---------------------------------------------------------------------
+# Load a flat binary file to a target address via mwr -force, one 32-
+# bit word at a time. Slow but reliable - bypasses xsdb's cache
+# coherency logic which is broken on DDR after psu_init.
+#
+#   bin_file   path of the raw image; zero-padded to a 4-byte multiple
+#   base_addr  target address that receives the first word
+# ---------------------------------------------------------------------
+proc load_binary {bin_file base_addr} {
+    set fp [open $bin_file rb]
+    set data [read $fp]
+    close $fp
+    set len [string length $data]
+
+    set pad [expr {(4 - ($len % 4)) % 4}]
+    if {$pad > 0} { append data [string repeat "\x00" $pad] }
+    set words [expr {[string length $data] / 4}]
+
+    puts " loading $len bytes ($words words) to [format 0x%08X $base_addr]"
+
+    targets -set -nocase -filter {name =~ "*PSU*"}
+    for {set i 0} {$i < $words} {incr i} {
+        set off [expr {$i * 4}]
+        binary scan $data @${off}iu word   ;# little-endian, unsigned
+        mwr -force [format "0x%X" [expr {$base_addr + $off}]] \
+            [format "0x%X" [expr {$word & 0xFFFFFFFF}]]
+        if {($i % 8192) == 0 && $i > 0} {
+            puts " [expr {$i * 100 / $words}]%..."
+        }
+    }
+    puts " 100% done"
+}
+
+# ---------------------------------------------------------------------
+# 1. Connect, system reset, force CSU JTAG bootmode.
+# ---------------------------------------------------------------------
+puts "Connecting..."
+connect
+puts "All targets:"
+targets
+
+targets -set -nocase -filter {name =~ "*PSU*"}
+puts "System reset..."
+rst -system
+after 500
+
+puts "Forcing JTAG boot mode (CSU 0xFF5E0200 <- 0x0100)..."
+mwr 0xFF5E0200 0x0100
+after 1000
+
+# ---------------------------------------------------------------------
+# 2. psu_init (DDR, clocks, MIO, UART, GEM).
+# ---------------------------------------------------------------------
+puts "Sourcing psu_init.tcl..."
+source $::env(FSBL_PSU_INIT_TCL)
+puts "psu_init..."
+psu_init
+after 1000
+puts "psu_post_config..."
+psu_post_config
+after 500
+
+# ---------------------------------------------------------------------
+# 3. UART0 baud init at 115200 8N1 (100 MHz ref / 124 / 7 = 115207).
+# ---------------------------------------------------------------------
+puts "UART0 baud init..."
+targets -set -nocase -filter {name =~ "*PSU*"}
+mwr 0xFF000000 0x03     ;# CR: TX_RST + RX_RST
+mwr 0xFF000004 0x20     ;# MR: 8N1
+mwr 0xFF000018 124      ;# BAUDGEN.CD = 124
+mwr 0xFF000034 6        ;# BAUDDIV.BDIV = 6
+mwr 0xFF000000 0x114    ;# CR: TXEN + RXEN + STPBRK
+after 100
+
+foreach c [split "=== JTAG ready, loading app ===\r\n" ""] {
+    scan $c %c v
+    mwr -force 0xFF000030 $v
+}
+after 200
+
+# ---------------------------------------------------------------------
+# 4. Load the app binary into OCM.
+# ---------------------------------------------------------------------
+puts ""
+puts "Loading: $::env(APP_BIN) at [format 0x%08X $OCM_BASE]"
+load_binary $::env(APP_BIN) $OCM_BASE
+
+# ---------------------------------------------------------------------
+# 5. Install b . boot loop at default RVBAR_EL3 (0xFFFF0000).
+# ---------------------------------------------------------------------
+puts ""
+puts "Installing RVBAR boot loop at 0xFFFF0000..."
+targets -set -nocase -filter {name =~ "*PSU*"}
+mwr -force $RVBAR_ADDR 0x14000000    ;# B . (aarch64 self-branch)
+mwr -force [format 0x%X [expr {$RVBAR_ADDR + 4}]] 0x14000000
+
+# ---------------------------------------------------------------------
+# 6. A53 #0: reset, halt, set PC, continue.
+# ---------------------------------------------------------------------
+puts ""
+puts "Preparing A53 #0..."
+targets -set -nocase -filter {name =~ "*A53*#0"}
+rst -processor
+after 200
+catch {stop}
+after 200
+puts "PC after rst -processor (expect 0xFFFF0000): [rrd pc]"
+
+# The entry point was read from the ELF header before connecting.
+puts "App ELF entry: $entry"
+rwr pc $entry
+puts "PC after rwr: [rrd pc]"
+
+puts ""
+puts "Continuing app..."
+con
+
+after 500
+puts "Detached. App is running."
+disconnect
+exit