Skip to main content

prjunnamed_memory/
lib.rs

1//! This library contains various common utilities for writing target-specific memory lowering passes.
2//!
3//! Target-specific memory lowering passes are organized roughly as follows:
4//!
5//! - for each memory in the design:
6//!   - a list of candidate lowerings is assembled:
7//!     - if a lowering mode (ff / lutram / blockram / hugeram) is explicitly selected, only the relevant
8//!       target cells types are considered
9//!     - for each available type of target cell:
10//!       - hard requirements are checked, and the target cell is rejected if they are unmet, eg:
11//!         - memories with more write ports than the target cell are rejected
12//!         - memories with asynchronous read ports are rejected if not supported by target cell
13//!       - a set of candidate "swizzle" transformations is assembled, for rearranging the memory geometry
14//!         to match the target cell
15//!         - the candidate swizzles are created according to target-specific rules, with some help from
16//!           common functions for eg. legalizing write mask granularity
17//!       - the best swizzle is selected, according to some metric (such as inserted soft mux depth or number
18//!         of used target cells)
19//!       - when a memory needs to be split/duplicated because of excessive read port count, each of the
20//!         split/duplicated sub-memories will have its own swizzle
21//!       - if a swizzle is egregiously bad (uses more resources than the entire FPGA has available), it is
22//!         rejected
23//!     - if applicable, a "fallback" lowering candidate is inserted into the list of candidates
24//!       - no fallback lowering if multiple write port domains present
25//!       - lowering to FFs for RAMs (very expensive)
26//!       - lowering to LUTs for ROMs (somewhat expensive)
27//!   - if no candidates are available, synthesis is aborted with a fatal error
28//!   - one of the candidate lowerings is picked, according to some target-specific heuristic
29//!     - initial heuristic will simply switch between target cells based on memory depth
30//!     - in the future, we will want to take target resource counts into account (moving the depth threshold
31//!       to avoid overfilling the FPGA)
32//!   - if the "fallback" lowering candidate has been selected, the common lowering code is called to perform it
33//!   - otherwise, target cell lowering is performed:
34//!     - the memory is split into sub-memories (using common code), as required to satisfy max read port count,
35//!       then each sub-memory is processed independently
36//!     - various fixup transformations (implemented in common code, but driven by target code) are performed on
37//!       the memory to match it with target cell capabilities:
38//!       - sync read ports are converted to async if required
39//!       - read port enable / clear / reset / initial value are unmapped and emulated if not supported
40//!       - transparency is unmapped and emulated if not supported
41//!       - read-first relations are emulated if not supported
42//!     - the swizzle (computed earlier) is applied to the memory (using common code), aligning its geometry
43//!       with the target cell
44//!     - the memory is consumed and converted into actual target cells
45//!     - any target-specific post-processing (such as merging output pipeline registers) is performed
46
47use std::collections::{btree_map, BTreeMap};
48
49use prjunnamed_netlist::{
50    Const, ControlNet, Design, FlipFlop, Memory, MemoryPortRelation, MemoryReadFlipFlop, MemoryReadPort,
51    MemoryWritePort, Net, Trit, Value,
52};
53
54/// Describes a "swizzle" transformation, used to align memory geometry to target requirements.
55///
56/// This structure is constructed by the target, with some help from common code.
57///
58/// The swizzle transformation involves the following steps:
59///
60/// 1. The data swizzle is applied to the write data, write mask, and read data signals.  This is used
61///    to insert dummy data bits to ensure that target cell's write mask granularity requirements are met.
62///
63/// 2. The address bits selected by `soft_addr_bits_mask` are removed from all ports, and implemented using soft
64///    logic.  This has the following effects:
65///
66///    - memory depth is divided by two (for each removed address bit)
67///    - memory width is multiplied by two (for each removed address bit)
68///    - for each write port:
69///      - if the removed address bit was implicit (as part of a wide port), the port width doesn't change
70///      - otherwise, the port width is multiplied by two, and a mux is inserted
71///    - for each write port:
72///      - if the removed address bit was implicit (as part of a wide port), the port width doesn't change
73///      - otherwise, the port width is multiplied by two, and write mask is adjusted with address decoding logic
74///
75///    This part of the transformation is used to deal with memories that have excessively asymmetric port widths.
76///
77/// 3. The `wide_log2` of every write port is adjusted up to the factor requested in `write_wide_log2`, by doubling
78///    port width and inserting write address decode logic as necessary.  It is an error if the requested factor
79///    is smaller than the current one (after applying step #2).
80///
81///    This is used for funny cases like iCE40 memories, which require using 16-bit write ports to have per-bit
82///    write masks.
83///
84/// 4. The `wide_log2` of every read port is adjusted up to the factor requested in `read_wide_log2`, by doubling
85///    port width and inserting muxes as necessary.  It is an error if the requested factor is smaller than the
86///    current one (after applying step #2).
87///
88///    This is used for obscure cases requiring a *minimum* port width asymmetry such as the Xilinx `RAM64X8SW` cell.
89///
90/// 5. The memory depth is adjusted to be at most `2 ** hard_addr_bits`.  Further, all read and write ports are
91///    adjusted to have an address signal that is exactly `hard_addr_bits - port.wide_log2()` bits.
92///
93///    If the memory has larger depth than this limit, it is split up into `2 ** hard_addr_bits`-deep chunks,
94///    the chunks are assembled "side-by-side" (converting depth to width), and appropriate read multiplexers
95///    and write address decoders are inserted.
96///
97/// 6. The memory is cut along its width, into `data_width_unit`-sized chunks, which are returned as individual
98///    [`Memory`] structures.  The last chunk is padded with dummy bits.
99#[derive(Debug, Clone, PartialEq, Eq)]
100pub struct MemorySwizzle {
101    /// The data swizzle.  In the first step, the data width of the memory is adjusted to the length of this
102    /// vector.  Each bit of the new memory width is described by an entry of this vector.  A `Some(idx)` means
103    /// that the bit should correspond to the given bit slice in the original memory, while a `None` means
104    /// a dummy bit inserted for legalization purposes.
105    pub data_swizzle: Vec<Option<usize>>,
106    pub soft_addr_bits_mask: u32,
107    pub write_wide_log2: Vec<usize>,
108    pub read_wide_log2: Vec<usize>,
109    pub hard_addr_bits: usize,
110    pub data_width_unit: usize,
111}
112
113/// An extension trait for `Memory` with assorted memory lowering utility functions.
114pub trait MemoryExt {
115    /// Returns a new memory, with the same dimensions and write ports as this one, but only the selected
116    /// subset of read ports.
117    ///
118    /// `output` should be the output value of the memory.  This function will slice out the bits corresponding
119    /// to the selected read ports, and return them.
120    fn extract_read_ports(&self, ports: &[usize], output: &Value) -> (Memory, Value);
121
122    /// Converts a synchronous read port to an asynchronous read port, by effectively extracting a flip-flop.
123    /// If transparency relations are involved, soft transparency logic is inserted.
124    ///
125    /// `output` is the output value of the memory cell.  This function performs a `replace_value` on the bits
126    /// corresponding to the converted port, and changes them in-place in the argument to newly-added void nets
127    /// that are to be driven by the newly-converted port.  Thus, the original memory cell must be unalived, and
128    /// `replace_value` must be called on the new value of `output`, possibly after further lowering.
129    ///
130    /// If the given read port is already asynchronous, the function will return without doing anything.
131    fn unmap_read_dff(&mut self, design: &Design, port_index: usize, output: &mut Value);
132
133    /// Unmaps the following features from a synchronous read port:
134    ///
135    /// - reset
136    /// - clear
137    /// - initial value
138    /// - if `include_transparency` is true, all `MemoryPortRelation::Transparent` relations
139    ///
140    /// If none of the above are used on the given port, the function will return without doing anything.
141    ///
142    /// Note that, if any of the above features is unmapped, all of reset, clear, and initial value can be unmapped
143    /// for free (it would actually be more complex to *not* unmap them).  Thus, we do not have granular controls
144    /// for them.
145    ///
146    /// This function must not be called on asynchronous read ports.
147    fn unmap_read_init_reset_transparency(
148        &mut self,
149        design: &Design,
150        port_index: usize,
151        include_transparency: bool,
152        output: &mut Value,
153    );
154
155    // TODO: needs a `can_emulate_read_before_write()`
156
157    /// Emulates all read-before-write relations in the memory.
158    ///
159    /// This function has strict preconditions:
160    ///
161    /// - all read ports must be synchronous and have the same clock
162    /// - a single write port cannot have both read-before-write and transparent relations
163    ///
164    /// If there are no read-before-write relations in the memory, this function will return without doing anything.
165    ///
166    /// The read-before-write relations are emulated by delaying all write ports with read-before-write relations
167    /// by half a cycle:  the clock polarity of the relevant write ports is flipped, and flip-flops (clocked by
168    /// the original clock) are added on the data, mask, and address inputs.
169    fn emulate_read_before_write(&mut self, design: &Design);
170
171    /// Constructs a `data_swizzle` for the [`MemorySwizzle`] structure, ensuring that the resulting memory
172    /// will satisfy mask granularity constraints for all write ports.  The argument must contain one number
173    /// for each write port, specifying its mask granularity in bits.  For each write port, the result will
174    /// satisfy the following constraints:
175    ///
176    /// - the result's length will be divisible by the specified granularity
177    /// - every aligned group of `granularity` bits will consist only of bits that share the same `mask` net
178    ///   in the write port; if the write port is wide, this rule applies individually to all of its subports
179    fn make_data_swizzle(&self, write_port_granularity: &[usize]) -> Vec<Option<usize>>;
180
181    /// Returns the depths of the memories that would be returned by `swizzle`, for cost estimation purposes.
182    /// Each entry in the result vector corresponds to one memory.
183    ///
184    /// For the simple case of the target mapping every memory to a single target cell, the length
185    /// of the returned vector is equal to the number of target cells used.  For more advanced mappings
186    /// where the target can lower a memory to multiple target cells (e.g. where hardware cascade or chip
187    /// selects are involved), the target can look at the exact memory depths required and perform its own
188    /// computations.
189    fn swizzle_depths(&self, swizzle: &MemorySwizzle) -> Vec<usize>;
190
191    /// Returns the number of soft-decoded address bits for each write port and read port for a given `swizzle`.
192    ///
193    /// Assumes that the swizzle applies to a memory where `extract_read_ports` has been called first with
194    /// the given `read_ports` argument.
195    ///
196    /// The first returned vector is the number of soft-decoded address bits for each write port, and the second
197    /// is the number of soft-decoded address bits for each read port included in `read_ports`.
198    fn swizzle_mux_bits(&self, read_ports: &[usize], swizzle: &MemorySwizzle) -> (Vec<usize>, Vec<usize>);
199
200    /// Performs a [`MemorySwizzle`] transformation on a memory, and returns a list of new memories with
201    /// the corresponding outputs that need to be driven.  The memories should be further lowered by the target.
202    fn swizzle(self, design: &Design, output: &Value, swizzle: &MemorySwizzle) -> Vec<(Memory, Value)>;
203
204    /// Returns true iff the memory is legal for fallback lowering.
205    ///
206    /// A memory is legal for fallback lowering iff all write ports share the same clock.
207    fn can_lower_fallback(&self) -> bool;
208
209    /// Performs fallback lowering of the memory.
210    ///
211    /// If the memory has write ports, a flip-flop is created for every bit of the memory, with associated
212    /// write address decoding logic and read muxes.  It is thus *very* inefficient unless the memory depth
213    /// is *very* small.
214    ///
215    /// If the memory has no write ports, it is lowered to just a tree of muxes for every read port, with
216    /// constants at the leafs.  On FPGA targets, this will be lowered to a fairly simple tree of LUTs,
217    /// which can be reasonably efficient for memories of small depth.
218    fn lower_fallback(self, design: &Design, output: &Value);
219}
220
221// creates a transparency emulation mux, to be used by both sync-to-async conversion and transparency emulation.
222// the data and mask given must be the same width as the read port.  the result is new data and mask:
223// if the write port is writing to the same address as the read port is reading, the write data will be
224// multiplexed onto the newly-returned data according to the write mask, and the new mask will be an OR of
225// the input mask and the write mask.  otherwise, the returned data and mask are the same as the input.
226// the mask can be None if the caller doesn't need to track it.
227fn transparency_mux(
228    memory: &Memory,
229    design: &Design,
230    read_port_index: usize,
231    write_port_index: usize,
232    data: Value,
233    mask: Option<Value>,
234) -> (Value, Option<Value>) {
235    let read_port = &memory.read_ports[read_port_index];
236    let write_port = &memory.write_ports[write_port_index];
237
238    // adjust write data/mask to read port width
239    let write_wide_log2 = write_port.wide_log2(memory);
240    let read_wide_log2 = read_port.wide_log2(memory);
241    let (write_addr, read_addr, write_data, write_mask) = match write_wide_log2.cmp(&read_wide_log2) {
242        std::cmp::Ordering::Less => {
243            // read wider than write: demux write data/mask using lower write addr bits
244            let wide_log2_diff = read_wide_log2 - write_wide_log2;
245            let select = write_port.addr.slice(..wide_log2_diff);
246            let write_addr = write_port.addr.slice(wide_log2_diff..);
247            let write_data =
248                design.add_shl(write_port.data.zext(read_port.data_len), &select, write_port.data.len() as u32);
249            let write_mask =
250                design.add_shl(write_port.mask.zext(read_port.data_len), &select, write_port.mask.len() as u32);
251            (write_addr, read_port.addr.clone(), write_data, write_mask)
252        }
253        std::cmp::Ordering::Equal => {
254            (write_port.addr.clone(), read_port.addr.clone(), write_port.data.clone(), write_port.mask.clone())
255        }
256        std::cmp::Ordering::Greater => {
257            // write wider than read: select write data/mask slices from wrport using lower read addr bits
258            let wide_log2_diff = write_wide_log2 - read_wide_log2;
259            let select = read_port.addr.slice(..wide_log2_diff);
260            let read_addr = read_port.addr.slice(wide_log2_diff..);
261            let write_data =
262                design.add_ushr(&write_port.data, &select, read_port.data_len as u32).slice(..read_port.data_len);
263            let write_mask =
264                design.add_ushr(&write_port.mask, &select, read_port.data_len as u32).slice(..read_port.data_len);
265            (write_port.addr.clone(), read_addr, write_data, write_mask)
266        }
267    };
268
269    // compare the relevant address bits
270    let max_addr_width = std::cmp::max(read_addr.len(), write_addr.len());
271    let addr_eq = design.add_eq(read_addr.zext(max_addr_width), write_addr.zext(max_addr_width));
272
273    // perform actual muxing
274    let sel_write = design.add_dedup_mux(addr_eq, &write_mask, Value::zero(write_mask.len()));
275    let new_data = design.add_bitwise_mux(sel_write, write_data, data);
276    let new_mask = mask.map(|mask| design.add_dedup_mux(addr_eq, design.add_or(&mask, write_mask), mask));
277
278    (new_data, new_mask)
279}
280
281impl MemoryExt for Memory {
282    fn extract_read_ports(&self, ports: &[usize], output: &Value) -> (Memory, Value) {
283        let mut new_memory = Memory {
284            depth: self.depth,
285            width: self.width,
286            init_value: self.init_value.clone(),
287            write_ports: self.write_ports.clone(),
288            read_ports: vec![],
289        };
290        let mut new_output = Value::new();
291        for &port_index in ports {
292            new_memory.read_ports.push(self.read_ports[port_index].clone());
293            let output_slice = self.read_port_output_slice(port_index);
294            new_output.extend(output.slice(output_slice));
295        }
296        (new_memory, new_output)
297    }
298
299    fn unmap_read_dff(&mut self, design: &Design, port_index: usize, output: &mut Value) {
300        let read_port = &mut self.read_ports[port_index];
301        let Some(port_flip_flop) = read_port.flip_flop.take() else {
302            return;
303        };
304        let new_port_output = design.add_void(read_port.data_len);
305        let mut data = new_port_output.clone();
306        for (write_port_index, relation) in port_flip_flop.relations.into_iter().enumerate() {
307            if relation == MemoryPortRelation::Transparent {
308                (data, _) = transparency_mux(self, design, port_index, write_port_index, data, None);
309            }
310        }
311        let q = design.add_dff(FlipFlop {
312            data,
313            clock: port_flip_flop.clock,
314            clear: port_flip_flop.clear,
315            load: ControlNet::Pos(Net::ZERO),
316            load_data: Value::undef(new_port_output.len()),
317            reset: port_flip_flop.reset,
318            enable: port_flip_flop.enable,
319            reset_over_enable: port_flip_flop.reset_over_enable,
320            clear_value: port_flip_flop.clear_value,
321            reset_value: port_flip_flop.reset_value,
322            init_value: port_flip_flop.init_value,
323        });
324        let output_slice = self.read_port_output_slice(port_index);
325        design.replace_value(output.slice(output_slice.clone()), q);
326        output[output_slice.clone()].copy_from_slice(&new_port_output[..]);
327    }
328
329    fn unmap_read_init_reset_transparency(
330        &mut self,
331        design: &Design,
332        port_index: usize,
333        include_transparency: bool,
334        output: &mut Value,
335    ) {
336        let read_port = &self.read_ports[port_index];
337        let port_flip_flop = read_port.flip_flop.as_ref().unwrap();
338
339        let has_init = port_flip_flop.has_init_value();
340        let has_reset = port_flip_flop.has_reset();
341        let has_clear = port_flip_flop.has_clear();
342
343        // create transparency data and mask values. data is the value that should override the read data,
344        // and mask is the validity mask corresponding to it.
345        // if not unmapping transparency, just set them to (X, 0).
346        let mut data = Value::undef(read_port.data_len);
347        let mut mask = Some(Value::zero(read_port.data_len));
348        let mut has_transparency = false;
349        if include_transparency {
350            for (write_port_index, &relation) in port_flip_flop.relations.iter().enumerate() {
351                if relation == MemoryPortRelation::Transparent {
352                    (data, mask) = transparency_mux(self, design, port_index, write_port_index, data, mask);
353                    has_transparency = true;
354                }
355            }
356        }
357        let mask = mask.unwrap();
358
359        if !has_init && !has_reset && !has_clear && !has_transparency {
360            return;
361        }
362
363        // now delay the above by one cycle, and also mix in the unmapped clear/reset/init into it.
364        let read_port = &mut self.read_ports[port_index];
365        let port_flip_flop = read_port.flip_flop.as_mut().unwrap();
366        let data = design.add_dff(FlipFlop {
367            data,
368            clock: port_flip_flop.clock,
369            clear: port_flip_flop.clear,
370            load: ControlNet::Pos(Net::ZERO),
371            load_data: Value::undef(read_port.data_len),
372            reset: port_flip_flop.reset,
373            enable: port_flip_flop.enable,
374            reset_over_enable: port_flip_flop.reset_over_enable,
375            clear_value: std::mem::replace(&mut port_flip_flop.clear_value, Const::undef(read_port.data_len)),
376            reset_value: std::mem::replace(&mut port_flip_flop.reset_value, Const::undef(read_port.data_len)),
377            init_value: std::mem::replace(&mut port_flip_flop.init_value, Const::undef(read_port.data_len)),
378        });
379        let mask = design.add_dedup_dff(FlipFlop {
380            data: mask,
381            clock: port_flip_flop.clock,
382            clear: port_flip_flop.clear,
383            load: ControlNet::Pos(Net::ZERO),
384            load_data: Value::undef(read_port.data_len),
385            reset: port_flip_flop.reset,
386            enable: port_flip_flop.enable,
387            reset_over_enable: port_flip_flop.reset_over_enable,
388            clear_value: Const::ones(read_port.data_len),
389            reset_value: Const::ones(read_port.data_len),
390            init_value: if has_init { Const::ones(read_port.data_len) } else { Const::undef(read_port.data_len) },
391        });
392        if include_transparency {
393            for relation in &mut port_flip_flop.relations {
394                if *relation == MemoryPortRelation::Transparent {
395                    *relation = MemoryPortRelation::Undefined;
396                }
397            }
398        }
399        port_flip_flop.clear = ControlNet::ZERO;
400        port_flip_flop.reset = ControlNet::ZERO;
401
402        // perform the actual muxing.
403        let new_port_output = design.add_void(read_port.data_len);
404        let mux = design.add_bitwise_mux(mask, data, &new_port_output);
405
406        // and do the replacement.
407        let output_slice = self.read_port_output_slice(port_index);
408        design.replace_value(output.slice(output_slice.clone()), mux);
409        output[output_slice.clone()].copy_from_slice(&new_port_output[..]);
410    }
411
412    fn emulate_read_before_write(&mut self, design: &Design) {
413        let mut rdfirst_write_ports = vec![false; self.write_ports.len()];
414        let mut transparent_write_ports = vec![false; self.write_ports.len()];
415
416        for port in &self.read_ports {
417            let port_flip_flop = port.flip_flop.as_ref().unwrap();
418            assert_eq!(
419                port_flip_flop.clock,
420                self.read_ports[0].flip_flop.as_ref().unwrap().clock,
421                "all read ports must have the same clock"
422            );
423            for (write_port_index, &relation) in port_flip_flop.relations.iter().enumerate() {
424                match relation {
425                    MemoryPortRelation::Undefined => (),
426                    MemoryPortRelation::ReadBeforeWrite => rdfirst_write_ports[write_port_index] = true,
427                    MemoryPortRelation::Transparent => transparent_write_ports[write_port_index] = true,
428                }
429            }
430        }
431
432        let init_undef = self.init_value.is_undef();
433
434        for (port_index, port) in self.write_ports.iter_mut().enumerate() {
435            if !rdfirst_write_ports[port_index] {
436                continue;
437            }
438            assert!(
439                !transparent_write_ports[port_index],
440                "a single write port cannot have both a read-before-write and a transparent relation to two read ports"
441            );
442            port.addr = design.add_dff(FlipFlop::new(std::mem::take(&mut port.addr), port.clock));
443            port.data = design.add_dff(FlipFlop::new(std::mem::take(&mut port.data), port.clock));
444            port.mask = design.add_dedup_dff(
445                FlipFlop::new(std::mem::take(&mut port.mask), port.clock).with_init(if init_undef {
446                    Const::undef(port.data.len())
447                } else {
448                    Const::zero(port.data.len())
449                }),
450            );
451            port.clock = !port.clock;
452            for read_port in &mut self.read_ports {
453                let read_ff = read_port.flip_flop.as_mut().unwrap();
454                read_ff.relations[port_index] = MemoryPortRelation::Undefined;
455            }
456        }
457    }
458
459    fn make_data_swizzle(&self, write_port_granularity: &[usize]) -> Vec<Option<usize>> {
460        assert_eq!(self.write_ports.len(), write_port_granularity.len());
461        let mask_slices = Vec::from_iter((0..self.width).map(|bit_index| {
462            Vec::from_iter(self.write_ports.iter().map(|port| {
463                Value::from_iter(
464                    (0..(port.data.len() / self.width))
465                        .map(|subport_index| port.mask[subport_index * self.width + bit_index]),
466                )
467            }))
468        }));
469        let mut result = vec![];
470        for bit_index in 0..self.width {
471            'check_slices: loop {
472                for (port_index, &port_granularity) in write_port_granularity.iter().enumerate() {
473                    if !result.len().is_multiple_of(port_granularity)
474                        && mask_slices[bit_index][port_index] != mask_slices[bit_index - 1][port_index]
475                    {
476                        result.push(None);
477                        continue 'check_slices;
478                    }
479                }
480                break;
481            }
482            result.push(Some(bit_index));
483        }
484        'align: loop {
485            for &port_granularity in write_port_granularity {
486                if !result.len().is_multiple_of(port_granularity) {
487                    result.push(None);
488                    continue 'align;
489                }
490            }
491            break;
492        }
493        result
494    }
495
496    fn swizzle_depths(&self, swizzle: &MemorySwizzle) -> Vec<usize> {
497        // compute how many low soft address bits there are
498        let soft_addr_bits = swizzle.soft_addr_bits_mask.count_ones() as usize;
499        // compute the dimensions of memory after swizzle steps 1-2 applied
500        let base_width = swizzle.data_swizzle.len() << soft_addr_bits;
501        let base_depth = self.depth >> soft_addr_bits;
502        // compute how many chunks created in step 5 require the full `2 ** hard_addr_bits` depth
503        let num_chunks_full = base_depth >> swizzle.hard_addr_bits;
504        let depth_full = 1 << swizzle.hard_addr_bits;
505        // compute the depth of the last, partial chunk created in step 5
506        let depth_partial = base_depth & ((1 << swizzle.hard_addr_bits) - 1);
507        // determine how much of the width of memory created in step 5 requires the full depth
508        let width_full = base_width * num_chunks_full;
509        // and determine its total width
510        let width_total = if depth_partial != 0 { width_full + base_width } else { width_full };
511        // now align up both of these numbers to `data_width_unit` chunks
512        let num_mems_full = width_full.div_ceil(swizzle.data_width_unit);
513        let num_mems_total = width_total.div_ceil(swizzle.data_width_unit);
514        // and compute the final result
515        let mut result = vec![depth_full; num_mems_full];
516        while result.len() < num_mems_total {
517            result.push(depth_partial);
518        }
519        result
520    }
521
522    fn swizzle_mux_bits(&self, read_ports: &[usize], swizzle: &MemorySwizzle) -> (Vec<usize>, Vec<usize>) {
523        let soft_addr_bits = swizzle.soft_addr_bits_mask.count_ones() as usize;
524        let chunks = self.depth.div_ceil(1 << (soft_addr_bits + swizzle.hard_addr_bits));
525        let chunks_mux_bits = if chunks <= 1 { 0 } else { (chunks - 1).ilog2() as usize + 1 };
526        let write_mux_bits = Vec::from_iter(self.write_ports.iter().zip(swizzle.write_wide_log2.iter()).map(
527            |(port, &new_wide_log2)| {
528                let old_wide_log2 = port.wide_log2(self);
529                chunks_mux_bits + soft_addr_bits + new_wide_log2 - old_wide_log2
530            },
531        ));
532        let read_mux_bits = Vec::from_iter(read_ports.iter().zip(swizzle.read_wide_log2.iter()).map(
533            |(&port_index, &new_wide_log2)| {
534                let port = &self.read_ports[port_index];
535                let old_wide_log2 = port.wide_log2(self);
536                chunks_mux_bits + soft_addr_bits + new_wide_log2 - old_wide_log2
537            },
538        ));
539        (write_mux_bits, read_mux_bits)
540    }
541
542    fn swizzle(mut self, design: &Design, output: &Value, swizzle: &MemorySwizzle) -> Vec<(Memory, Value)> {
543        // 0-width memories can have no outputs, and can just be unalived without remorse.
544        if self.width == 0 {
545            return vec![];
546        }
547        // 0-depth memories are annoying to deal with — get them out of the way.
548        if self.depth == 0 {
549            // while you cannot really read any row, the read ports can still have well-defined values
550            // because of initial values and resets.  deal with them by converting them to async ones.
551            let mut output = output.clone();
552            for port_index in 0..self.read_ports.len() {
553                self.unmap_read_dff(design, port_index, &mut output);
554            }
555            // now we have only asynchronous read ports, and can replace all their outputs with undef.
556            design.replace_value(&output, Value::undef(output.len()));
557            return vec![];
558        }
559
560        // first, replicate the calculations from `swizzle_size`
561        let soft_addr_bits = swizzle.soft_addr_bits_mask.count_ones() as usize;
562        let base_width = swizzle.data_swizzle.len() << soft_addr_bits;
563        let base_depth = self.depth >> soft_addr_bits;
564        let depth_full = 1 << swizzle.hard_addr_bits;
565        let depth_partial = base_depth & ((1 << swizzle.hard_addr_bits) - 1);
566        let num_chunks_full = base_depth >> swizzle.hard_addr_bits;
567        let num_chunks_total = if depth_partial == 0 { num_chunks_full } else { num_chunks_full + 1 };
568        let width_full = base_width * num_chunks_full;
569        let width_total = base_width * num_chunks_total;
570        let num_mems_full = width_full.div_ceil(swizzle.data_width_unit);
571        let num_mems_total = width_total.div_ceil(swizzle.data_width_unit);
572        let width_total_aligned = num_mems_total * swizzle.data_width_unit;
573
574        // performs steps 2-5 of the transformation on a port's address; the first value returned is
575        // the new address for the port (a subset of the original address bits, potentially padded with 0 at the top),
576        // while the second value is the remaining address bits (known as mux addr), which should be used to form
577        // a mux / write address decoder.
578        // the third value describes the swizzle for the data bits: it is indexed first by the mux_addr value, second by
579        // the original port's subport index. the value is (new subport index, chunk index within the new port).
580        fn swizzle_addr(
581            addr: &Value,
582            swizzle: &MemorySwizzle,
583            num_chunks_total: usize,
584            orig_wide_log2: usize,
585            new_wide_log2: usize,
586        ) -> (Value, Value, Vec<Vec<(usize, usize)>>) {
587            #[derive(Clone, Copy)]
588            enum BitDisposition {
589                NewSubportBit(usize),
590                ChunkIndexBit(usize),
591            }
592
593            let mut num_chunks_port = num_chunks_total;
594            let mut hw_addr = Value::new();
595            let mut mux_addr = Value::new();
596            let mut old_subport_bit_disposition = vec![];
597            let mut mux_addr_bit_disposition = vec![];
598            let mut new_subport_bit = 0;
599            let mut new_chunk_bit = 0;
600            for addr_bit_index in 0..orig_wide_log2 {
601                if ((swizzle.soft_addr_bits_mask >> addr_bit_index) & 1) != 0 {
602                    old_subport_bit_disposition.push(BitDisposition::ChunkIndexBit(new_chunk_bit));
603                    new_chunk_bit += 1;
604                } else {
605                    old_subport_bit_disposition.push(BitDisposition::NewSubportBit(new_subport_bit));
606                    new_subport_bit += 1;
607                }
608            }
609            assert!(new_subport_bit <= new_wide_log2);
610            for (addr_bit_index, addr_bit) in (orig_wide_log2..).zip(addr) {
611                if ((swizzle.soft_addr_bits_mask >> addr_bit_index) & 1) != 0 {
612                    mux_addr_bit_disposition.push(BitDisposition::ChunkIndexBit(new_chunk_bit));
613                    new_chunk_bit += 1;
614                    mux_addr.push(addr_bit);
615                    num_chunks_port *= 2;
616                } else if new_subport_bit != new_wide_log2 {
617                    mux_addr_bit_disposition.push(BitDisposition::NewSubportBit(new_subport_bit));
618                    new_subport_bit += 1;
619                    mux_addr.push(addr_bit);
620                    num_chunks_port *= 2;
621                } else if hw_addr.len() != swizzle.hard_addr_bits - new_wide_log2 {
622                    hw_addr.push(addr_bit);
623                } else {
624                    mux_addr_bit_disposition.push(BitDisposition::ChunkIndexBit(new_chunk_bit));
625                    new_chunk_bit += 1;
626                    mux_addr.push(addr_bit);
627                }
628            }
629            assert!(hw_addr.len() <= swizzle.hard_addr_bits - new_wide_log2);
630            while hw_addr.len() < swizzle.hard_addr_bits - new_wide_log2 {
631                hw_addr.push(Net::ZERO);
632            }
633            let useful_mux_bits = if num_chunks_port < 2 { 0 } else { (num_chunks_port - 1).ilog2() as usize + 1 };
634            while mux_addr.len() < useful_mux_bits {
635                mux_addr_bit_disposition.push(BitDisposition::ChunkIndexBit(new_chunk_bit));
636                new_chunk_bit += 1;
637                mux_addr.push(Net::ZERO);
638            }
639
640            let mut data_swizzle = Vec::from_iter(std::iter::repeat_n(vec![], num_chunks_port));
641            for port_chunk in 0..num_chunks_port {
642                for orig_subport in 0..(1 << orig_wide_log2) {
643                    let mut new_subport = 0;
644                    let mut new_chunk = 0;
645                    for (bit, disposition) in old_subport_bit_disposition.iter().copied().enumerate() {
646                        let bit_val = (orig_subport >> bit) & 1;
647                        match disposition {
648                            BitDisposition::NewSubportBit(new_bit) => new_subport |= bit_val << new_bit,
649                            BitDisposition::ChunkIndexBit(new_bit) => new_chunk |= bit_val << new_bit,
650                        }
651                    }
652                    for (bit, disposition) in mux_addr_bit_disposition.iter().copied().enumerate() {
653                        if bit >= useful_mux_bits {
654                            continue;
655                        }
656                        let bit_val = (port_chunk >> bit) & 1;
657                        match disposition {
658                            BitDisposition::NewSubportBit(new_bit) => new_subport |= bit_val << new_bit,
659                            BitDisposition::ChunkIndexBit(new_bit) => new_chunk |= bit_val << new_bit,
660                        }
661                    }
662
663                    data_swizzle[port_chunk].push((new_subport, new_chunk));
664                }
665            }
666
667            (hw_addr, mux_addr, data_swizzle)
668        }
669
670        // perform steps 1-5 of the transformation on the write ports; this is done in a somewhat weird way:
671        // each write port is split into `1 << wide_log2` chunks, which are later reassembled
672        // when emitting the final memories.  this is done to make the slicing easier.
673        let mut write_ports = vec![];
674        for (port_index, orig_port) in self.write_ports.iter().enumerate() {
675            let orig_wide_log2 = orig_port.wide_log2(&self);
676            let new_wide_log2 = swizzle.write_wide_log2[port_index];
677            let (new_addr, mux_addr, data_swizzle) =
678                swizzle_addr(&orig_port.addr, swizzle, num_chunks_total, orig_wide_log2, new_wide_log2);
679            let mut subports = vec![];
680            for _ in 0..(1 << new_wide_log2) {
681                subports.push(MemoryWritePort {
682                    addr: new_addr.clone(),
683                    data: Value::undef(width_total_aligned),
684                    mask: Value::zero(width_total_aligned),
685                    clock: orig_port.clock,
686                });
687            }
688            for (mux_index, mux_index_data_swizzle) in data_swizzle.into_iter().enumerate() {
689                let mux_match = design.add_eq(&mux_addr, Const::from_uint(mux_index as u128, mux_addr.len()));
690                let mut new_data = Value::new();
691                let mut new_mask = Value::new();
692                // when creating the write mask, whenever a dummy bit needs to be inserted, make it a duplicate
693                // of the previous bit instead of an X.  this ensures that any mask granularity constraints
694                // are met (such constraints are generally the reason why dummy bits are inserted into the swizzle
695                // anyway).
696                for orig_subport_index in 0..(1 << orig_wide_log2) {
697                    for &source in &swizzle.data_swizzle {
698                        match source {
699                            Some(index) => {
700                                new_data.push(orig_port.data[orig_subport_index * self.width + index]);
701                                new_mask.push(orig_port.mask[orig_subport_index * self.width + index]);
702                            }
703                            None => {
704                                new_data.push(Net::UNDEF);
705                                if new_mask.is_empty() {
706                                    new_mask.push(Net::UNDEF);
707                                } else {
708                                    new_mask.push(new_mask.msb());
709                                }
710                            }
711                        }
712                    }
713                }
714                if !mux_addr.is_empty() {
715                    new_mask = design.add_dedup_mux(
716                        mux_match,
717                        new_mask,
718                        Value::zero(swizzle.data_swizzle.len() << orig_wide_log2),
719                    );
720                }
721                for (orig_subport_index, (new_subport_index, new_chunk_index)) in
722                    mux_index_data_swizzle.into_iter().enumerate()
723                {
724                    let new_chunk_slice = new_chunk_index * swizzle.data_swizzle.len()
725                        ..(new_chunk_index + 1) * swizzle.data_swizzle.len();
726                    let orig_subport_slice = orig_subport_index * swizzle.data_swizzle.len()
727                        ..(orig_subport_index + 1) * swizzle.data_swizzle.len();
728                    subports[new_subport_index].data[new_chunk_slice.clone()]
729                        .copy_from_slice(&new_data[orig_subport_slice.clone()]);
730                    subports[new_subport_index].mask[new_chunk_slice].copy_from_slice(&new_mask[orig_subport_slice]);
731                }
732            }
733            write_ports.push(subports);
734        }
735
736        // likewise, perform steps 1-5 of the transformation on the read ports, also in a chunky manner.
737        let mut read_ports = vec![];
738        for (port_index, orig_port) in self.read_ports.iter().enumerate() {
739            let orig_wide_log2 = orig_port.wide_log2(&self);
740            let new_wide_log2 = swizzle.read_wide_log2[port_index];
741            let (new_addr, mut mux_addr, data_swizzle) =
742                swizzle_addr(&orig_port.addr, swizzle, num_chunks_total, orig_wide_log2, new_wide_log2);
743            let mut subports = vec![];
744            for _ in 0..(1 << new_wide_log2) {
745                subports.push((
746                    MemoryReadPort {
747                        addr: new_addr.clone(),
748                        data_len: width_total_aligned,
749                        flip_flop: orig_port.flip_flop.as_ref().map(|flip_flop| MemoryReadFlipFlop {
750                            clock: flip_flop.clock,
751                            clear: flip_flop.clear,
752                            reset: flip_flop.reset,
753                            enable: flip_flop.enable,
754                            reset_over_enable: flip_flop.reset_over_enable,
755                            clear_value: Const::undef(width_total_aligned),
756                            reset_value: Const::undef(width_total_aligned),
757                            init_value: Const::undef(width_total_aligned),
758                            relations: flip_flop.relations.clone(),
759                        }),
760                    },
761                    design.add_void(width_total_aligned),
762                ));
763            }
764            if let Some(ref flip_flop) = orig_port.flip_flop {
765                mux_addr = design.add_dff(FlipFlop::new(mux_addr, flip_flop.clock).with_enable(flip_flop.enable));
766            }
767            let mut output_chunks = vec![];
768            for mux_index_data_swizzle in data_swizzle {
769                let mut output_chunk = Value::undef(orig_port.data_len);
770                for (orig_subport_index, (new_subport_index, new_chunk_index)) in
771                    mux_index_data_swizzle.into_iter().enumerate()
772                {
773                    let new_chunk_slice = new_chunk_index * swizzle.data_swizzle.len()
774                        ..(new_chunk_index + 1) * swizzle.data_swizzle.len();
775                    if let Some(ref flip_flop) = orig_port.flip_flop {
776                        let swizzle_const = |value: &Const| {
777                            Const::from_iter(swizzle.data_swizzle.iter().map(|&source| match source {
778                                Some(index) => value[orig_subport_index * self.width + index],
779                                None => Trit::Undef,
780                            }))
781                        };
782                        let new_flip_flop = subports[new_subport_index].0.flip_flop.as_mut().unwrap();
783                        new_flip_flop.clear_value[new_chunk_slice.clone()]
784                            .copy_from_slice(&swizzle_const(&flip_flop.clear_value)[..]);
785                        new_flip_flop.reset_value[new_chunk_slice.clone()]
786                            .copy_from_slice(&swizzle_const(&flip_flop.reset_value)[..]);
787                        new_flip_flop.init_value[new_chunk_slice.clone()]
788                            .copy_from_slice(&swizzle_const(&flip_flop.init_value)[..]);
789                    }
790                    let swizzled_output_chunk = subports[new_subport_index].1.slice(new_chunk_slice);
791                    for (&index, swizzled_output_bit) in swizzle.data_swizzle.iter().zip(swizzled_output_chunk) {
792                        if let Some(index) = index {
793                            output_chunk[orig_subport_index * self.width + index] = swizzled_output_bit;
794                        }
795                    }
796                }
797                output_chunks.push(output_chunk);
798            }
799            // create the mux tree bottom-up, halving the size of the output_chunks array each time
800            for addr_bit in &mux_addr {
801                output_chunks = Vec::from_iter(output_chunks.chunks(2).map(|pair| {
802                    if pair.len() == 2 {
803                        design.add_mux(addr_bit, &pair[1], &pair[0])
804                    } else {
805                        // we may be reading out-of-bounds. could also use a mux-with-undef here, but there's little point.
806                        pair[0].clone()
807                    }
808                }))
809            }
810            let [port_output] = output_chunks.try_into().unwrap();
811            design.replace_value(output.slice(self.read_port_output_slice(port_index)), port_output);
812            read_ports.push(subports);
813        }
814
815        // prepare the memories, applying step 6 (not including initial values)
816        let mut result = vec![];
817        for index in 0..num_mems_total {
818            let data_slice = index * swizzle.data_width_unit..(index + 1) * swizzle.data_width_unit;
819            let depth = if index < num_mems_full { depth_full } else { depth_partial };
820            let mut memory = Memory {
821                depth,
822                width: swizzle.data_width_unit,
823                init_value: Const::undef(depth * swizzle.data_width_unit),
824                write_ports: vec![],
825                read_ports: vec![],
826            };
827            for write_subports in &write_ports {
828                let mut port = MemoryWritePort {
829                    addr: write_subports[0].addr.clone(),
830                    data: Value::new(),
831                    mask: Value::new(),
832                    clock: write_subports[0].clock,
833                };
834                for subport in write_subports {
835                    port.data.extend(subport.data.slice(data_slice.clone()));
836                    port.mask.extend(subport.mask.slice(data_slice.clone()));
837                }
838                memory.write_ports.push(port);
839            }
840            let mut new_output = Value::new();
841            for read_subports in &read_ports {
842                let mut port = MemoryReadPort {
843                    addr: read_subports[0].0.addr.clone(),
844                    data_len: swizzle.data_width_unit * read_subports.len(),
845                    flip_flop: read_subports[0].0.flip_flop.as_ref().map(|flip_flop| MemoryReadFlipFlop {
846                        clock: flip_flop.clock,
847                        clear: flip_flop.clear,
848                        reset: flip_flop.reset,
849                        enable: flip_flop.enable,
850                        reset_over_enable: flip_flop.reset_over_enable,
851                        clear_value: Const::new(),
852                        reset_value: Const::new(),
853                        init_value: Const::new(),
854                        relations: flip_flop.relations.clone(),
855                    }),
856                };
857                for (subport, subport_output) in read_subports {
858                    if let Some(ref subport_flip_flop) = subport.flip_flop {
859                        let port_flip_flop = port.flip_flop.as_mut().unwrap();
860                        port_flip_flop.clear_value.extend(subport_flip_flop.clear_value.slice(data_slice.clone()));
861                        port_flip_flop.reset_value.extend(subport_flip_flop.reset_value.slice(data_slice.clone()));
862                        port_flip_flop.init_value.extend(subport_flip_flop.init_value.slice(data_slice.clone()));
863                    }
864                    new_output.extend(subport_output.slice(data_slice.clone()));
865                }
866                memory.read_ports.push(port);
867            }
868            result.push((memory, new_output));
869        }
870
871        // fill initial values.
872        for row in 0..self.depth {
873            let orig_value = self.init_value.slice(row * self.width..(row + 1) * self.width);
874            // apply step 1
875            let swizzled_value = Const::from_iter(swizzle.data_swizzle.iter().map(|&index| match index {
876                None => Trit::Undef,
877                Some(index) => orig_value[index],
878            }));
879            // apply steps 2 and 5 to the row index
880            let mut new_row = 0;
881            let mut new_row_bit = 0;
882            let mut chunk_index = 0;
883            let mut chunk_index_bit = 0;
884            assert!(row < (1 << 32));
885            for bit in 0..32 {
886                let bit_value = (row >> bit) & 1;
887                if ((swizzle.soft_addr_bits_mask >> bit) & 1) != 0 || new_row_bit == swizzle.hard_addr_bits {
888                    chunk_index |= bit_value << chunk_index_bit;
889                    chunk_index_bit += 1;
890                } else {
891                    new_row |= bit_value << new_row_bit;
892                    new_row_bit += 1;
893                }
894            }
895            assert_eq!(new_row_bit, swizzle.hard_addr_bits);
896            // apply step 6
897            for (index, bit) in swizzled_value.iter().enumerate() {
898                let full_index = chunk_index * swizzle.data_swizzle.len() + index;
899                let mem_index = full_index / swizzle.data_width_unit;
900                let mem_bit_index = full_index % swizzle.data_width_unit;
901                result[mem_index].0.init_value[new_row * swizzle.data_width_unit + mem_bit_index] = bit;
902            }
903        }
904
905        result
906    }
907
908    fn can_lower_fallback(&self) -> bool {
909        for port in &self.write_ports {
910            if port.clock != self.write_ports[0].clock {
911                return false;
912            }
913        }
914        true
915    }
916
917    fn lower_fallback(mut self, design: &Design, output: &Value) {
918        let mut write_clock = None;
919        for write_port in &self.write_ports {
920            if write_clock.is_none() {
921                write_clock = Some(write_port.clock);
922            } else if write_clock != Some(write_port.clock) {
923                panic!("trying to lower memory with multiple write clocks");
924            }
925        }
926        let mut output = output.clone();
927        // convert all read ports to async first.
928        for port_index in 0..self.read_ports.len() {
929            self.unmap_read_dff(design, port_index, &mut output);
930        }
931        let max_wide_log2 = self
932            .write_ports
933            .iter()
934            .map(|port| port.wide_log2(&self))
935            .max()
936            .unwrap_or(0)
937            .max(self.read_ports.iter().map(|port| port.wide_log2(&self)).max().unwrap_or(0));
938        // create a swizzle structure.  this will effectively split the memory into one sub-memory per row.
939        let swizzle = MemorySwizzle {
940            data_swizzle: Vec::from_iter((0..self.width).map(Some)),
941            soft_addr_bits_mask: (1 << max_wide_log2) - 1,
942            write_wide_log2: vec![0; self.write_ports.len()],
943            read_wide_log2: vec![0; self.read_ports.len()],
944            hard_addr_bits: 0,
945            data_width_unit: self.width,
946        };
947        for (memory, mem_output) in self.swizzle(design, &output, &swizzle) {
948            assert_eq!(memory.depth, 1);
949            let data = if memory.write_ports.is_empty() {
950                Value::from(memory.init_value)
951            } else {
952                let q = design.add_void(memory.width);
953                let mut data = q.clone();
954                for port in memory.write_ports {
955                    assert_eq!(port.data.len(), data.len());
956                    // multiplex our data onto the chain
957                    data = design.add_bitwise_mux(port.mask, port.data, data);
958                }
959                design.replace_value(
960                    &q,
961                    design.add_dff(FlipFlop::new(data, write_clock.unwrap()).with_init(memory.init_value)),
962                );
963                q
964            };
965            // all read ports are asynchronous, and there is only a single row in the memory.
966            // just wire it straight to all read data outputs.
967            for (port_index, port) in memory.read_ports.into_iter().enumerate() {
968                design
969                    .replace_value(mem_output.slice(port_index * memory.width..(port_index + 1) * memory.width), &data);
970                assert_eq!(port.data_len, data.len());
971            }
972        }
973    }
974}
975
976trait DesignExt {
977    fn add_dedup_dff(&self, flip_flop: FlipFlop) -> Value;
978    fn add_dedup_mux(&self, arg1: impl Into<ControlNet>, arg2: impl Into<Value>, arg3: impl Into<Value>) -> Value;
979    fn add_bitwise_mux(&self, arg1: impl Into<Value>, arg2: impl Into<Value>, arg3: impl Into<Value>) -> Value;
980}
981
982impl DesignExt for Design {
983    fn add_dedup_dff(&self, mut flip_flop: FlipFlop) -> Value {
984        let mut cache = BTreeMap::new();
985        let mut result_swizzle = vec![];
986        let mut dedup_data = Value::new();
987        let mut dedup_load_data = Value::new();
988        let mut dedup_reset_value = Const::new();
989        let mut dedup_clear_value = Const::new();
990        let mut dedup_init_value = Const::new();
991
992        for ((((data_bit, load_data_bit), reset_bit), clear_bit), init_bit) in flip_flop
993            .data
994            .iter()
995            .zip(flip_flop.load_data.iter())
996            .zip(flip_flop.reset_value)
997            .zip(flip_flop.clear_value)
998            .zip(flip_flop.init_value)
999        {
1000            match cache.entry((data_bit, load_data_bit, reset_bit, clear_bit, init_bit)) {
1001                btree_map::Entry::Vacant(entry) => {
1002                    let index = dedup_data.len();
1003                    dedup_data.push(data_bit);
1004                    dedup_load_data.push(load_data_bit);
1005                    dedup_reset_value.push(reset_bit);
1006                    dedup_clear_value.push(clear_bit);
1007                    dedup_init_value.push(init_bit);
1008                    result_swizzle.push(index);
1009                    entry.insert(index);
1010                }
1011                btree_map::Entry::Occupied(entry) => {
1012                    result_swizzle.push(*entry.get());
1013                }
1014            }
1015        }
1016        flip_flop.data = dedup_data;
1017        flip_flop.load_data = dedup_load_data;
1018        flip_flop.reset_value = dedup_reset_value;
1019        flip_flop.clear_value = dedup_clear_value;
1020        flip_flop.init_value = dedup_init_value;
1021        let dedup_result = self.add_dff(flip_flop);
1022        Value::from_iter(result_swizzle.iter().map(|&index| dedup_result[index]))
1023    }
1024
1025    fn add_dedup_mux(&self, arg1: impl Into<ControlNet>, arg2: impl Into<Value>, arg3: impl Into<Value>) -> Value {
1026        let arg1 = arg1.into();
1027        let arg2 = arg2.into();
1028        let arg3 = arg3.into();
1029        let mut cache = BTreeMap::new();
1030        let mut result_swizzle = vec![];
1031        let mut dedup_arg2 = Value::new();
1032        let mut dedup_arg3 = Value::new();
1033        for (arg2_bit, arg3_bit) in arg2.iter().zip(arg3) {
1034            match cache.entry((arg2_bit, arg3_bit)) {
1035                btree_map::Entry::Vacant(entry) => {
1036                    let index = dedup_arg2.len();
1037                    dedup_arg2.push(arg2_bit);
1038                    dedup_arg3.push(arg3_bit);
1039                    result_swizzle.push(index);
1040                    entry.insert(index);
1041                }
1042                btree_map::Entry::Occupied(entry) => {
1043                    result_swizzle.push(*entry.get());
1044                }
1045            }
1046        }
1047        let dedup_result = self.add_mux(arg1, dedup_arg2, dedup_arg3);
1048        Value::from_iter(result_swizzle.iter().map(|&index| dedup_result[index]))
1049    }
1050
1051    fn add_bitwise_mux(&self, arg1: impl Into<Value>, arg2: impl Into<Value>, arg3: impl Into<Value>) -> Value {
1052        let arg1 = arg1.into();
1053        let arg2 = arg2.into();
1054        let arg3 = arg3.into();
1055        let mut result = Value::new();
1056        let mut index = 0;
1057        assert_eq!(arg1.len(), arg2.len());
1058        assert_eq!(arg1.len(), arg3.len());
1059        while index < arg1.len() {
1060            let mut end_index = index;
1061            while end_index < arg1.len() && arg1[end_index] == arg1[index] {
1062                end_index += 1;
1063            }
1064            let arg2_chunk = arg2.slice(index..end_index);
1065            let arg3_chunk = arg3.slice(index..end_index);
1066            result.extend(self.add_dedup_mux(arg1[index], arg2_chunk, arg3_chunk));
1067            index = end_index;
1068        }
1069        result
1070    }
1071}