use crate::processor::isa_mods::vector::VecRegInterface;
use crate::processor::isa_mods::PossibleXlen;
use super::types::*;
use crate::processor::decode::{Opcode, InstructionBits};
use anyhow::{bail, Result};

/// Memory OPeration enum
///
/// Vector load/store operations have four addressing variants (the four `mop` encodings):
///
/// - Unit-Stride, i.e. accessing contiguous memory, which has special-case versions
///   (see [UnitStrideLoadOp] and [UnitStrideStoreOp])
/// - Strided, with a variable stride
/// - Indexed, which can be Ordered or Unordered (two separate encodings)
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum RvvMopType {
    UnitStride,
    Strided(u64),
    Indexed{ordered: bool},
}

/// Special variants of vector loads with unit-stride
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum UnitStrideLoadOp {
    Load,
    WholeRegister,
    ByteMaskLoad,
    FaultOnlyFirst
}

/// Special variants of vector stores with unit-stride
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum UnitStrideStoreOp {
    Store,
    WholeRegister,
    ByteMaskStore
}

/// The "direction" of a memory operation.
/// Used by [DecodedMemOp].
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum MemOpDir {
    /// Load = taking values from memory and putting them in vector registers
    Load,
    /// Store = taking values from vector registers and putting them in memory
    Store
}

/// The different kinds of RISC-V V vector loads/stores.
/// One top-level enum which encapsulates Strided access (also used for basic unit-stride access),
/// Indexed access, and the special cases of unit-stride access (e.g. whole-register, bytemasked, fault-only-first).
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum DecodedMemOp {
    /// Moves elements of [Self::Strided::nf] vector register groups to/from contiguous segments of memory,
    /// where each segment is separated by a stride.
    ///
    /// - The start of each segment is separated by [Self::Strided::stride] bytes.
    /// - Each segment is `nf * eew` bits long, i.e. [Self::Strided::nf] elements long.
    /// - Each element in the i-th segment maps to the i-th element of a vector register group.
    /// - This instruction doesn't do anything if the stored `vstart >= vl`.
    ///
    /// In the simplest case, `nf = 1`.
    /// For example: `stride = 8`, `eew = 32 bits = 4 bytes`
    /// ```text
    /// base addr + (i * 8) <=> v0[i]
    /// ```
    ///
    /// Increasing [Self::Strided::nf] makes it more complicated.
    /// For example if `nf = 3`, `stride = 8`, `eew = 32 bits = 4 bytes`:
    /// ```text
    /// base addr + (i * 8) + (0 * 4) <=> v0[i]
    /// base addr + (i * 8) + (1 * 4) <=> v1[i]
    /// base addr + (i * 8) + (2 * 4) <=> v2[i]
    /// ```
    ///
    /// In the most complicated case, [Self::Strided::emul] may also be > 1.
    /// If `EMUL = 2`, `nf = 3`, `stride = 8`, `eew = 32 bits = 4 bytes`:
    /// ```text
    /// base addr + (i * 8) + (0 * 4) <=> (v0..v1)[i]
    /// base addr + (i * 8) + (1 * 4) <=> (v2..v3)[i]
    /// base addr + (i * 8) + (2 * 4) <=> (v4..v5)[i]
    /// ```
    /// Element 2 of the segment maps to vector register *group* 2,
    /// i.e. v4 and v5, rather than v2.
    Strided{
        /// The stride, specified in bytes.
        /// TODO: make this signed everywhere
        stride: u64,
        /// The direction, i.e. load or store
        dir: MemOpDir,
        /// The effective element width - encoded in the instruction rather than copied from vtype
        eew: Sew,
        /// The effective LMUL of the operation, i.e. the size of the vector register group.
        /// Computed as `(EEW / vtype.SEW) * vtype.LMUL`.
        ///
        /// AFAIK this exists to keep the Effective Vector Length (EVL) the same when the element width changes.
        /// For example, if you set `vtype = (SEW = 32, LMUL = 1)` and `vl = 4` to prepare for 32-bit arithmetic,
        /// and then load 4x 64-bit elements (EEW = 64), the effective LMUL of the load doubles to make room.
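        /// For that example, the arithmetic (just the formula above, spelled out) is:
        /// ```text
        /// EMUL = (EEW / SEW) * LMUL = (64 / 32) * 1 = 2
        /// ```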
        emul: Lmul,
        /// The effective vector length - always equal to the current vl
        evl: u32,
        /// Number of Fields for segmented access
        nf: u8,
    },
    /// Moves elements of [Self::Indexed::nf] vector register groups to/from contiguous segments of memory,
    /// where each segment is offset by an index taken from another vector.
    ///
    /// - The start of each segment is defined by `base address + index_vector[i]`.
    /// - Each segment is `nf * eew` bits long, i.e. [Self::Indexed::nf] elements long.
    /// - Each element in the i-th segment maps to the i-th element of a vector register group.
    /// - Accesses within each segment are not ordered relative to each other.
    /// - If the ordered variant of this instruction is used, the segments must be accessed
    ///   in the order specified by the index vector.
    /// - This instruction doesn't do anything if the stored `vstart >= vl`.
    ///
    /// The EEW and EMUL for the elements themselves are equal to the SEW and LMUL stored in `vtype`.
    /// The EEW and EMUL for the indices are defined in the instruction.
    ///
    /// In the simplest case, `nf = 1`.
    /// For example:
    /// ```text
    /// base addr + index_vector[i] <=> v0[i]
    /// ```
    ///
    /// Increasing [Self::Indexed::nf] makes it more complicated.
    /// For example if `nf = 3`, `element width = 32 bits = 4 bytes`:
    /// ```text
    /// base addr + index_vector[i] + (0 * 4) <=> v0[i]
    /// base addr + index_vector[i] + (1 * 4) <=> v1[i]
    /// base addr + index_vector[i] + (2 * 4) <=> v2[i]
    /// ```
    ///
    /// In the most complicated case, [Self::Indexed::emul] may also be > 1.
    /// If `EMUL = 2`, `nf = 3`, `element width = 32 bits = 4 bytes`:
    /// ```text
    /// base addr + index_vector[i] + (0 * 4) <=> (v0..v1)[i]
    /// base addr + index_vector[i] + (1 * 4) <=> (v2..v3)[i]
    /// base addr + index_vector[i] + (2 * 4) <=> (v4..v5)[i]
    /// ```
    /// Element 2 of the segment maps to vector register *group* 2,
    /// i.e. v4 and v5, rather than v2.
    Indexed{
        /// Whether elements must be accessed in the order specified by the index vector.
        ordered: bool,
        /// The width of the indices. Indices are byte offsets.
        index_ew: Sew,
        /// The effective LMUL for the indices.
        index_emul: Lmul,
        /// The direction, i.e. load or store
        dir: MemOpDir,
        /// The width of the elements being accessed from memory
        eew: Sew,
        /// The effective LMUL of the elements being accessed from memory. See [DecodedMemOp::Strided::emul].
        emul: Lmul,
        /// The effective vector length - always equal to the current vl
        evl: u32,
        /// Number of Fields for segmented access
        nf: u8,
    },
    /// Moves the contents of [Self::WholeRegister::num_regs] vector registers to/from a contiguous range in memory.
    WholeRegister{
        /// The direction, i.e. load or store
        dir: MemOpDir,
        /// The number of registers to load or store.
        /// Encoded the same way as `nf` for other instructions.
        /// Must be a power of two.
        num_regs: u8,
        /// The width of the elements being accessed.
        /// This doesn't affect which bytes are moved, but load variants of this instruction exist for each width.
        eew: Sew,
    },
    /// Moves the contents of a mask register to/from a contiguous range of memory.
    ///
    /// This instruction transfers at least `vl` bits into the mask register,
    /// one bit for each element that could be used in subsequent vector instructions.
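    ///
    /// For example (a sketch; assumes the RVV v1.0 mask load/store mnemonics `vlm.v`/`vsm.v`):
    /// ```text
    /// vsetvli t0, a1, e32, m1, ta, ma  ; vl = some number of elements
    /// vlm.v   v0, (a0)                 ; loads ceil(vl/8) bytes of mask bits into v0
    /// ```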
    ///
    /// It is therefore equivalent to a unit-stride load where
    /// - EVL = `ceil(vl/8)`
    /// - EEW = 8 bits
    /// - EMUL = 1 (the maximum LMUL is 8, thus `vl/8` bytes must be able to fit into a single vector register)
    /// - the tail-agnostic setting is always on
    ByteMask{
        /// The direction, i.e. load or store
        dir: MemOpDir,
        /// The number of bytes to load, i.e. `ceil(vl/8)`
        evl: u32
    },
    /// Loads elements from contiguous segments in memory into [Self::FaultOnlyFirst::nf] vector register groups.
    /// If an exception is encountered while loading elements from segment 0, it is trapped as usual.
    /// However, an exception encountered after that point is ignored, and `vl` is set to the index of the
    /// faulting segment instead.
    ///
    /// - The start of the range is defined by `base address`.
    /// - Each segment is `nf * eew` bits long, i.e. [Self::FaultOnlyFirst::nf] elements long.
    /// - Each element in the i-th segment maps to the i-th element of a vector register group.
    /// - Accesses within each segment are not ordered relative to each other.
    /// - This instruction doesn't do anything if the stored `vstart >= vl`.
    ///
    /// The mappings of address to element are the same as for [DecodedMemOp::Strided], with the stride equal
    /// to the segment width (`nf * eew` bytes). For example, if `EMUL = 2`, `nf = 3`,
    /// `eew = 32 bits = 4 bytes`, the implied stride is 12 bytes:
    ///
    /// ```text
    /// These accesses can trap an exception
    /// base addr + (0 * 12) + (0 * 4) <=> (v0..v1)[0]
    /// base addr + (0 * 12) + (1 * 4) <=> (v2..v3)[0]
    /// base addr + (0 * 12) + (2 * 4) <=> (v4..v5)[0]
    ///
    /// These accesses set vl = i on an exception, where i != 0
    /// base addr + (i * 12) + (0 * 4) <=> (v0..v1)[i]
    /// base addr + (i * 12) + (1 * 4) <=> (v2..v3)[i]
    /// base addr + (i * 12) + (2 * 4) <=> (v4..v5)[i]
    /// ```
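    ///
    /// For example (a sketch; assumes the RVV fault-only-first mnemonic `vle32ff.v`):
    /// ```text
    /// vle32ff.v v8, (a0)  ; EEW = 32, nf = 1; traps only on a fault in segment 0,
    ///                     ; otherwise truncates vl at the first faulting segment
    /// ```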
    FaultOnlyFirst{
        /// The width of the elements being accessed from memory
        eew: Sew,
        /// The effective LMUL of the operation. See [DecodedMemOp::Strided::emul].
        emul: Lmul,
        /// The effective vector length - always equal to the current vl
        evl: u32,
        /// Number of Fields for segmented access
        nf: u8,
    },
}

impl DecodedMemOp {
    pub fn dir(&self) -> MemOpDir {
        use DecodedMemOp::*;
        match *self {
            Strided{dir, ..} => dir,
            Indexed{dir, ..} => dir,
            WholeRegister{dir, ..} => dir,
            ByteMask{dir, ..} => dir,
            // Fault-only-first operations are always loads
            FaultOnlyFirst{..} => MemOpDir::Load,
        }
    }
    pub fn evl(&self) -> u32 {
        use DecodedMemOp::*;
        match *self {
            Strided{evl, ..} => evl,
            Indexed{evl, ..} => evl,
            WholeRegister{num_regs, eew, ..} =>
                num_regs as u32 * (VLEN as u32 / 8) / (eew.width_in_bytes() as u32),
            ByteMask{evl, ..} => evl,
            FaultOnlyFirst{evl, ..} => evl,
        }
    }
    fn _get_encoded_emul_eew_nf(inst: InstructionBits, current_vtype: VType) -> Result<(Lmul, Sew, u8)> {
        if let InstructionBits::FLdStType{width, nf, mew, ..} = inst {
            // The full width = the mew bit + original width
            let width = if mew { 1 << 3 } else { 0 } | width;

            // Get the element width we want to use (which is NOT the same as the one encoded in vtype)
            // EEW = Effective Element Width
            let eew_num = match width {
                0b0001 | 0b0010 | 0b0011 | 0b0100 => bail!("LoadFP uses width for normal floats, not vectors"),
                0b1001..=0b1111 => bail!("LoadFP using reserved width {}", width),
                0b0000 => 8,
                0b0101 => 16,
                0b0110 => 32,
                0b0111 => 64,
                // Reserved by the spec; a 128-bit width has not been explicitly assigned to this encoding
                0b1000 => 128,
                _ => bail!("LoadFP has impossible width {}", width)
            };

            // Check the effective element width is valid, given the current SEW and LMUL
            // EMUL = Effective LMUL
            // Because LMULs can be as small as 1/8th, evaluate it as an integer * 8 (effectively 29.3 fixed-point)
            let emul_times_8 = current_vtype.val_times_lmul_over_sew(eew_num * 8);
            // Limit EMUL to the same values as LMUL
            if emul_times_8 > 64 || emul_times_8 < 1 {
                bail!("emul * 8 too big or too small: {}", emul_times_8);
            }

            // NF = Number of Fields
            // If NF > 1, it's a *segmented* load/store,
            // where "packed contiguous segments" are moved into "multiple destination vector register groups".
            // For example
            //   a0 => rgbrgbrgbrgbrgb (24-bit pixels, 8-bits-per-component)
            //   vlseg3e8 v8, (a0) ; NF = 3, EEW = 8
            //   -> v8  = rrrr
            //      v9  = gggg
            //      v10 = bbbb
            let nf = nf + 1;
            // EMUL * NF = number of underlying registers in use
            // => EMUL * NF should be <= 8
            if (emul_times_8 * (nf as u32)) > 64 {
                bail!("emul * nf too big: {}", emul_times_8 * (nf as u32) / 8);
            }

            // Convert EEW, EMUL to enums
            let eew = match eew_num {
                8 => Sew::e8,
                16 => Sew::e16,
                32 => Sew::e32,
                64 => Sew::e64,
                128 => Sew::e128,
                _ => bail!("Impossible EEW {}", eew_num)
            };
            let emul = match emul_times_8 {
                1 => Lmul::eEighth,
                2 => Lmul::eQuarter,
                4 => Lmul::eHalf,
                8 => Lmul::e1,
                16 => Lmul::e2,
                32 => Lmul::e4,
                64 => Lmul::e8,
                _ => bail!("Impossible EMUL-times-8 {}", emul_times_8)
            };

            return Ok((emul, eew, nf));
        } else {
            unreachable!("get_encoded_emul_eew_nf MUST be passed an instruction of FLdStType, got {:?}", inst)
        }
    }
    /// Decode a Load/Store opcode into a DecodedMemOp structure.
    /// Performs all checks to ensure the instruction is a valid RISC-V V vector load/store.
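    ///
    /// For example (a sketch of the decode path below, not normative):
    /// ```text
    /// vle32.v v8, (a0)  ; opcode = LoadFP, mop = 00 (unit-stride), lumop (rs2) = 00000,
    ///                   ; width = 0b0110 => EEW = 32 bits, nf = 1
    ///                   ; => DecodedMemOp::Strided { stride: 4, eew: e32, evl: vl, nf: 1, .. }
    /// ```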
    pub fn decode_load_store<uXLEN: PossibleXlen>(
            opcode: Opcode, inst: InstructionBits,
            current_vtype: VType, current_vl: u32,
            sreg: &mut dyn VecRegInterface<uXLEN>) -> Result<DecodedMemOp> {
        if let InstructionBits::FLdStType{rs2, mop, ..} = inst {
            let dir = match opcode {
                Opcode::LoadFP => MemOpDir::Load,
                Opcode::StoreFP => MemOpDir::Store,
                _ => bail!("Incorrect opcode passed to decode_load_store: {:?}", opcode)
            };

            let (emul, eew, nf) = DecodedMemOp::_get_encoded_emul_eew_nf(inst, current_vtype)?;

            // As per section 7.4, evl for ByteMask operations = ceil(vl/8)
            // We don't have div_ceil in Rust yet, so do (vl + 7) / 8 which is equivalent
            let bytemask_vl = (current_vl + 7) / 8;

            // WholeRegister operations only support power-of-2 nfs
            let nf_pow2 = matches!(nf, 1 | 2 | 4 | 8);

            // MOP = Memory OPeration
            // Determines the indexing mode
            let mop = match mop {
                0b00 => RvvMopType::UnitStride,
                0b10 => RvvMopType::Strided(sreg.sreg_read_xlen(rs2)?.into()),
                0b01 => RvvMopType::Indexed{ordered: false},
                0b11 => RvvMopType::Indexed{ordered: true},
                _ => panic!("impossible mop bits {:02b}", mop)
            };

            let decoded_mop = match mop {
                RvvMopType::UnitStride => match dir {
                    MemOpDir::Load => {
                        use UnitStrideLoadOp::*;
                        let lumop = match rs2 {
                            0b00000 => Load,
                            0b01000 => WholeRegister,
                            0b01011 => ByteMaskLoad,
                            0b10000 => FaultOnlyFirst,
                            _ => bail!("invalid unit stride type {:05b}", rs2)
                        };
                        match lumop {
                            Load => DecodedMemOp::Strided{
                                // Unit-stride = segments are packed together
                                // => each segment is (element_width * fields_per_segment) bytes apart
                                // => eew * nf
                                stride: eew.width_in_bytes() * (nf as u64),
                                dir, eew, emul, nf,
                                evl: current_vl,
                            },
                            WholeRegister => if nf_pow2 {
                                DecodedMemOp::WholeRegister{
                                    dir, eew,
                                    num_regs: nf,
                                }
                            } else {
                                bail!("WholeRegister operation with non-power-of-2 nf {} is impossible", nf);
                            },
                            ByteMaskLoad => if eew == Sew::e8 {
                                DecodedMemOp::ByteMask {
                                    dir,
                                    evl: bytemask_vl,
                                }
                            } else {
                                bail!("Can't have ByteMaskLoad with non-byte EEW")
                            },
                            FaultOnlyFirst => DecodedMemOp::FaultOnlyFirst{
                                eew, emul,
                                evl: current_vl,
                                nf,
                            },
                        }
                    },
                    MemOpDir::Store => {
                        use UnitStrideStoreOp::*;
                        let sumop = match rs2 {
                            0b00000 => Store,
                            0b01000 => WholeRegister,
                            0b01011 => ByteMaskStore,
                            _ => bail!("invalid unit stride type {:05b}", rs2)
                        };
                        match sumop {
                            Store => DecodedMemOp::Strided{
                                // Unit-stride = segments are packed together
                                // => each segment is (element_width * fields_per_segment) bytes apart
                                // => eew * nf
                                stride: eew.width_in_bytes() * (nf as u64),
                                dir, eew, emul, nf,
                                evl: current_vl,
                            },
                            WholeRegister => if nf_pow2 {
                                if eew != Sew::e8 {
                                    bail!("WholeRegister operation with EEW {:?} != e8 is impossible", eew);
                                }
                                DecodedMemOp::WholeRegister{
                                    dir, eew,
                                    num_regs: nf,
                                }
                            } else {
                                bail!("WholeRegister operation with non-power-of-2 nf {} is impossible", nf);
                            },
                            ByteMaskStore => if eew == Sew::e8 {
                                DecodedMemOp::ByteMask {
                                    dir,
                                    evl: bytemask_vl,
                                }
                            } else {
                                bail!("Can't have ByteMaskStore with non-byte EEW")
                            },
                        }
                    },
                },
                RvvMopType::Strided(stride) => DecodedMemOp::Strided{
                    stride,
                    dir, eew, emul, nf,
                    evl: current_vl,
                },
                RvvMopType::Indexed{ordered} => if eew != Sew::e128 {
                    DecodedMemOp::Indexed{
                        ordered,
                        // The instruction encodes the EEW/EMUL of the *indices*;
                        // the data elements use the SEW/LMUL from vtype.
                        index_ew: eew,
                        index_emul: emul,
                        eew: current_vtype.vsew,
                        emul: current_vtype.vlmul,
                        dir,
                        evl: current_vl,
                        nf,
                    }
                } else {
                    bail!("Indexed operations with 128-bit elements not tested, may not make sense (what happens if >64-bits on a 64-bit machine?)")
                }
            };

            return Ok(decoded_mop);
        } else {
            bail!("decode_load_store MUST be passed an instruction of FLdStType, got {:?}", inst)
        }
    }
}
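
// A minimal sketch (an addition, not part of the original decoder): it exercises the
// EMUL-times-8 fixed-point arithmetic described in `_get_encoded_emul_eew_nf`, using a
// hypothetical standalone helper, since the real computation lives on `VType`.
#[cfg(test)]
mod emul_fixed_point_sketch {
    /// Hypothetical helper mirroring `VType::val_times_lmul_over_sew` for an LMUL
    /// expressed as `lmul_times_8` (so LMUL = 1/8 maps to 1, and LMUL = 8 maps to 64).
    fn emul_times_8(eew_bits: u32, sew_bits: u32, lmul_times_8: u32) -> u32 {
        eew_bits * lmul_times_8 / sew_bits
    }

    #[test]
    fn eew_64_sew_32_lmul_1_gives_emul_2() {
        // EMUL = (EEW / SEW) * LMUL = (64 / 32) * 1 = 2, i.e. EMUL * 8 = 16
        assert_eq!(emul_times_8(64, 32, 8), 16);
    }

    #[test]
    fn emul_below_one_eighth_truncates_to_zero() {
        // EEW = 8, SEW = 64, LMUL = 1/8 => EMUL * 8 = 0, which the decoder rejects
        assert_eq!(emul_times_8(8, 64, 1), 0);
    }
}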