%% Copyright (C) by Paulo Flores. %% %% Time-stamp: 2015-01-27 16:59:37 team.bib pff(at)inesc-id.pt %% %% Desc: articles e InProceedings para oprojecto FCT: apporx %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % to generate a list of all bibliografy % printbibkey /home/pff/lib/tex/bib/pff-author.bib % mv test.bbl xx ; sed -e "s/\\LARGE/\\clearpage \\LARGE/g" xx> test.bbl % to generate a hmtl file see bbl2html.pl % to check dup references % grep eta ~/lib/tex/bib/pff.bib | cut -d\{ -f 2 |tr [A-Z] [a-z]|sort|uniq -c %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @String{PFF-AUTHOR = "Paulo Flores" } @Article{Aksoy-TSP15-RefA, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "Exact and Approximate Algorithms for the Filter Design Optimization Problem", journal = TSP, volume = 63, number = 1, pages = " 142--154 ", month = JAN, year = 2015, abstract = "The filter design optimization (FDO) problem is For Revi defined as finding a set of filter coefficients which yields a filter design with minimum complexity, satisfying the filter constraints. It has received a tremendous interest due to the widespread application of filters. Assuming that the coefficient multiplications in the filter design are realized under a shift-adds architecture, the complexity is generally defined in terms of the total number of adders and subtractors. In this article, we present an exact FDO algorithm that can guarantee the minimum design complexity under the minimum quantization value, but can only be applied to filters with a small number of coefficients. We also introduce an approximate algorithm that can handle filters with a large number of coefficients using less computational resources than the exact FDO algorithm and find better solutions than existing FDO heuristics. We describe how these algorithms can be modified to handle a delay constraint in the shift-adds designs of the multiplier blocks and to target different filter constraints and filter forms. Experimental results show the effectiveness of the proposed algorithms with respect to prominent FDO algorithms and explore the impact of design parameters, such as the filter length, quantization value, and filter form, on the complexity and performance of filter designs.", keywords = "Finite Impulse Response (FIR) filters; Direct and trans posed forms; Filter design optimization problem; Multiplierless design; Depth-first and local search methods; Delay reduction; Filter design and structures; DSP Algorithm implementation in hardware and software.", issn = "1053-587X", doi = "10.1109/TSP.2014.2366713" } @Article{Aksoy-TODAES14-RefB, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "Multiplierless Design of Folded {DSP} Blocks", journal = TODAES, volume = 20, number = 1, pages = " 14:1--14:24 ", month = NOV, year = 2014, abstract = "This article addresses the problem of optimizing the implementation cost of the time-multiplexed constant multiplication (TMCM) operation which realizes the multiplication of an input variable by a single constant selected from a set of multiple constants at a time. It presents an efficient algorithm, called ORPHEUS, which finds a multiplierless TMCM design by sharing logic operators, i.e., adders, subtractors, adders/subtractors, and multiplexors (MUXes). Moreover, this article introduces folded design architectures for the digital signal processing (DSP) blocks, such as finite impulse response (FIR) filters and linear DSP transforms, and describes how these folded DSP blocks can be efficiently realized using TMCM operations optimized by ORPHEUS. Experimental results indicate that ORPHEUS can find better solutions than existing TMCM algorithms, yielding TMCM designs requiring less area. They also show that the folded architectures lead to alternative designs with significantly less area, but incurring an increase in latency and energy consumption, compared to the parallel architecture.", keywords = "Time-multiplexed constant multiplication; Folded architecture, multiplierless design; Area optimization; Finite Impulse Response (FIR) filter, Discrete Cosine Transform (DCT); Integer Cosine Transform (ICT); Folded architectures.", issn = "1084-4309", doi = "10.1145/2663343" } @Article{Neves-TVLSI14, author = "Nuno Neves and Nuno Sebastião and David Matos and Pedro Tomás and Paulo Flores and Nuno Roma", title = "Multicore {SIMD} {ASIP} for Next-Generation Sequencing and Alignment Biochip Platforms", journal = TVLSI, pages = " ??--?? ", month = JUL # " 18 ", year = 2014, abstract = "Targeting the development of new biochip platforms capable of autonomously sequencing and aligning biological sequences, a new multicore processing structure is proposed in this manuscript. This multicore structure makes use of a shared memory model and multiple instantiations of a novel application-specific instruction-set processor (ASIP) to simultaneously exploit both fine and coarse-grained parallelism and to achieve high performance levels at low-power consumption. The proposed ASIP is built by extending the instruction set architecture of a synthesizable processor, including both general and special-purpose single-instruction multiple-data instructions. This allows an efficient exploitation of fine-grained parallelism on the alignment of biological sequences, achieving over 30x speedup when compared with sequential algorithmic implementations. The complete system was prototyped on different field-programmable gate array platforms and synthesized with a 90-nm CMOS process technology. Experimental results demonstrate that the multicore structure scales almost linearly with the number of instantiated cores, achieving performances similar to a quad-core Intel Core i7 3820 processor, while using 25x less energy.", keywords = "Application Specific Instruction-Set Architecture; Single-Instruction Multiple-Data; Multi-Core Architecture, Biological Sequences Alignment; Biochip Platforms.", issn = "1063-8210", doi = "10.1109/TVLSI.2014.2333757", note = ONLINE* } @Article{Aksoy-JCSSP14, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "A Tutorial on Multiplierless Design of {FIR} Filters: Algorithms and Architectures", journal = JCSSP, volume = 33, number = 6, pages = " 1689--1719 ", location = "Springer US", month = JUN, year = 2014, abstract = "Finite impulse response (FIR) filtering is a ubiquitous operation in digital signal processing systems and is generally implemented in full custom circuits due to high-speed and low-power design requirements. The complexity of an FIR filter is dominated by the multiplication of a large number of filter coefficients by the filter input or its time-shifted versions. Over the years, many high-level synthesis algorithms and filter architectures have been introduced in order to design FIR filters efficiently. This article reviews how constant multiplications can be designed using shifts and adders/subtractors that are maximally shared through a high-level synthesis algorithm based on some optimization criteria. It also presents different forms of FIR filters, namely, direct, transposed, and hybrid and shows how constant multiplications in each filter form can be realized under a shift-adds architecture. More importantly, it explores the impact of the multiplierless realization of each filter form on area, delay, and power dissipation of both custom (ASIC) and reconfigurable (FPGA) circuits by carrying out experiments with different bitwidths of filter input, design libraries, reconfigurable target devices, and optimization criteria in high-level synthesis algorithms.", keywords = "FIR filter;Direct, transposed, and hybrid forms; Multiplierless design; High-level synthesis; Area and delay optimization; Custom and reconfigurable circuits.", issn = "0278-081X (Print) 1531-5878 (Online)", doi = "10.1007/s00034-013-9727-8" } @Article{Brito-TVLSI14, author = "Diogo Brito and Taimur Rabuske and Jorge Fernandes and Paulo Flores and José Monteiro", title = "Quaternary Logic Look-up Table in Standard {CMOS}", journal = TVLSI, pages = " ??--?? ", location = "IEEE", month = MAR # ", 19", year = 2014, abstract = "Interconnections are increasingly the dominant contributor to delay, area and energy consumption in CMOS digital circuits. Multiple-valued logic can decrease the average power required for level transitions and reduces the number of required interconnections, hence also reducing the impact of interconnections on overall energy consumption. In this paper, we propose a quaternary lookup table (LUT) structure, designed to replace or complement binary LUTs in field programmable gate arrays. The circuit is compatible with standard CMOS processes, with a single voltage supply and employing only simple voltage-mode structures. A clock boosting technique is used to optimize the switches resistance and power consumption. The proposed implementation overcomes several limitations found in previous quaternary implementations published so far, such as the need for special features in the CMOS process or power-hungry current-mode cells. We present a full adder prototype based on the designed LUT, fabricated in a standard 130-nm CMOS technology, able to work at 100 MHz while consuming 122 W. The experimental results demonstrate the correct quaternary operation and confirm the power efficiency of the proposed design.", keywords = "Field-Programmable Gate Array (FPGA); Lookup Table (LUT); Multiple-Valued Logic (MVL); Quaternary Logic; Standard CMOS Technology.", issn = "1063-8210", doi = "10.1109/TVLSI.2014.2308302", note = ONLINE* } @Article{Sebastiao-CCPE13, author = "Nuno Sebastião and Nuno Roma and Paulo Flores", title = "Configurable and scalable class of high performance hardware accelerators for simultaneous {DNA} sequence alignment", journal = CCPE, volume = 25, number = 10, pages = " 1319--1339 ", month = JUL, year = 2013, abstract = "A new class of efficient and flexible hardware accelerators for DNA local sequence alignment based on the widely used Smith-Waterman algorithm is proposed in this paper. This new class of accelerating structures exploits an innovative technique that tracks the origin coordinates of the best alignment to allow a significant reduction of the size of the dynamic programming matrix that needs to be recomputed during the subsequent traceback phase, providing a considerable reduction of the resulting time and memory requirements. The significant performance of the enhanced class of accelerators is attained by also providing support for an additional level of parallelism: the capability to concurrently align several query sequences with one or more reference sequences, according to the specific application requisites. Moreover, the accelerator class also includes specially designed processing elements that improve the resource usage when implemented in a Field Programmable Gate Array (FPGA), and easily provide several different configurations in an Application Specific Integrated Circuit (ASIC) implementation. Obtained results demonstrated that speedups as high as 278 can be obtained in ASIC accelerating structures. A FPGA-based prototyping platform, operating at a 40 times lower clock frequency and incorporating a complete alignment embedded system, still provides significant speedups as high as 27, compared with a pure software implementation.", keywords = "Hardware accelerator; DNA; Local Sequence Alignment; Traceback; FPGA; ASIC.", issn = "1532-0634", doi = "10.1002/cpe.2934", note = "Special Issue: High performance computing and simulation: architectures, systems, algorithms, technologies, services, and applications" } @Article{Aksoy-TVLSI13-RefC, author = "Levent Aksoy and Cristiano Lazzari and Eduardo Costa and Paulo Flores and José Monteiro", title = "Design of Digit-Serial {FIR} Filters: Algorithms, Architectures, and a {CAD} Tool", journal = TVLSI, volume = 21, number = 3, pages = " 498--511 ", month = MAR, year = 2013, abstract = "In the last two decades, many efficient algorithms and architectures have been introduced for the design of low-complexity bit-parallel multiple constant multiplications (MCM) operation which dominates the complexity of many digital signal processing systems. On the other hand, little attention has been given to the digit-serial MCM design that offers alternative low-complexity MCM operations albeit at the cost of an increased delay. In this paper, we address the problem of optimizing the gate-level area in digit-serial MCM designs and introduce high-level synthesis algorithms, design architectures, and a computer-aided design tool. Experimental results show the efficiency of the proposed optimization algorithms and of the digit-serial MCM architectures in the design of digit-serial MCM operations and finite impulse response filters.", keywords = "0-1 Integer Linear Programming (ILP); Digit-Serial Arithmetic; Finite Impulse Response (FIR) Filters; Gate-Level Area Optimization; Multiple Constant Multiplications.", issn = "1063-8210", doi = "10.1109/TVLSI.2012.2188917", history = "Accepted and re-submitted on 2011-11-17" } @Article{Sebastiao-TVLSI12, author = "Nuno Sebastião and Nuno Roma and Paulo Flores", title = "Integrated Hardware Architecture for Efficient Computation of the n-Best Bio-Sequence Local Alignments in Embedded Platforms", journal = TVLSI, volume = 20, number = 7, pages = " 1262 -- 1275 ", month = JUL, year = 2012, abstract = "A flexible hardware architecture that implements a set of new and efficient techniques to significantly reduce the computational requirements of the commonly used Smith-Waterman sequence alignment algorithm is presented. Such innovative techniques use information gathered by the hardware accelerator during the computation of the alignment scores to constrain the size of the subsequence that has to be post-processed in the traceback phase using a general purpose processor (GPP). Moreover, the proposed structure is also capable of computing the n-best local alignments according to the Waterman-Eggert algorithm, becoming the first hardware architecture that is able to simultaneously evaluate the n-best alignments of a given sequence pair, by incorporating a set of ordering units that work in parallel with the systolic array. A complete alignment system was developed and implemented in a Virtex-4 FPGA, by integrating the proposed accelerator architecture with a Leon3 GPP. The obtained experimental results demonstrate that the proposed systm is flexible and allows the alignment of large sequences in memory constrained systems. As an example, a speedup of 17 was obtained with the conceived system when compared to a regular implementation of the LALIGN35 program running on an Intel Core2 Duo processor running at a 40 times higher frequency.", keywords = "DNA sequence alignment; Smith-Waterman (S-W); Waterman-Eggert (W-E); Hardware accelerator.", issn = "1063-8210", doi = "10.1109/TVLSI.2011.2157541" } @Article{Aksoy-INTEGRATION12, author = "Levent Aksoy and Cristiano Lazzari and Eduardo Costa and Paulo Flores and José Monteiro", title = "High-level algorithms for the optimization of gate-level area in digit-serial multiple constant multiplications", journal = INTEGRATION, volume = 45, number = 3, pages = " 294--306 ", month = JUN, year = 2012, abstract = "The last two decades have seen tremendous effort on the development of high-level synthesis algorithms for efficient realization of the multiplication of a variable by a set of constants using only addition, subtraction, and shift operations. These algorithms generally target the minimization of the number of adders and subtractors, assuming that shifts are realized using only wires due to the bit-parallel processing of the input data. On the other hand, digit-serial architectures offer alternative low-complexity designs since digit-serial operators occupy less area and are independent of the data wordlength. However, in this case, shifts are no longer free in terms of hardware and require D flip-flops. Moreover, each digit-serial addition, subtraction, and shift operation has different implementation cost at gate-level. Hence, this article introduces high-level algorithms that optimize the area of digit-serial constant multiplications under the shift-adds architecture by taking into account the implementation cost of each operation at gate-level. Experimental results indicate that our high-level algorithms obtain better solutions than prominent algorithms designed for the minimization of the number of operations in terms of gate-level area and their solutions lead to less complex digit-serial MCM designs. It is also shown that the use of shift-adds architecture yields significant area reductions when compared to the constant multiplications designed using generic digit-serial constant multipliers.", keywords = "Multiple constant multiplications; Digit-serial architectures; Gate-level area optimization; 0-1 Integer linear programming; Graph-based algorithms.", issn = "0167-9260", doi = "10.1016/j.vlsi.2011.11.008", note = "Special Issue of GLSVLSI 2011: Current Trends on VLSI and Ultra Low-Power Design" } @Article{Sebastiao-MICPRO12, author = "Nuno Sebastião and Nuno Roma and Paulo Flores", title = "Hardware accelerator architecture for simultaneous short-read {DNA} sequences alignment with enhanced traceback phase", journal = MICPRO, publisher = "Elsevier Science Publishers B. V.", volume = 36, number = 2, pages = " 96--109 ", month = MAR, year = 2012, abstract = "Dynamic programming algorithms are widely used to find the optimal sequence alignment between any two DNA sequences. This manuscript presents a new, flexible and scalable hardware accelerator architecture to speedup the implementation of the frequently used Smith-Waterman algorithm. When integrated with a general purpose processor, the developed accelerator significantly reduces the computation time and memory space requirements of alignment tasks. Such efficiency mainly comes from two innovative techniques that are proposed. First, the usage of the maximum score cell coordinates, gathered during the computation of the alignment scores in the matrix-fill phase, in order to significantly reduce the time and memory requirements of the traceback phase. Second, the exploitation of an additional level of parallelism in order to simultaneously align several query sequences with the same reference sequence, targeting the processing of short-read DNA sequences. The results obtained from the implementation of a complete alignment system based on the new accelerator architecture in a Virtex-4 FPGA showed that the proposed techniques are feasible and the developed accelerator is able to provide speedups as high as 16 for the considered test sequences. Moreover, it was also shown that the proposed approach allows the processing of larger DNA sequences in memory restricted environments.", keywords = "DNA; Hardware accelerator; Local sequence alignment; Traceback.", issn = "0141-9331", doi = "10.1016/j.micpro.2011.05.003", note = "Special Issue - Exploitation Of Hardware Accelerators" } @Article{Aksoy-TODAES12, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "Optimization Algorithms for the Multiplierless Realization of Linear Transforms", journal = TODAES, volume = 17, number = 1, pages = " 3:1 -- 3:27 ", month = JAN, year = 2012, abstract = "This article addresses the problem of finding the fewest number of addition and subtraction operations in the multiplication of a constant matrix with an input vector, a fundamental operation in many linear digital signal processing transforms. We first introduce an exact Common Subexpression Elimination (CSE) algorithm that formalizes the minimization of the number of operations as a 0-1 integer linear programming problem. Since there are still instances that the proposed exact algorithm cannot handle due to the NP-completeness of the problem, we also introduce a CSE heuristic algorithm that iteratively finds the most common 2-term subexpressions with the minimum conflicts among the expressions. Furthermore, since the main drawback of CSE algorithms is their dependency on a particular number representation, we propose a hybrid algorithm that initially finds promising realizations of linear transforms using a numerical difference method, and then applies the proposed CSE algorithm to utilize the common subexpressions iteratively. The experimental results on a comprehensive set of instances indicate that the proposed approximate algorithms find competitive results with those of the exact CSE algorithm and obtain better solutions than prominent previously proposed heuristics. It is also observed that our solutions yield significant area reductions in the design of linear transforms after circuit synthesis compared to direct realizations of linear transforms.", keywords = "0-1 Integer Linear Programming; Constant Matrix-Vector Multiplication; Common Subexpression Elimination; Numerical Difference Method.", issn = "1084-4309", doi = "10.1145/2071356.2071359" } @Article{Aksoy-MICPRO11, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "Finding the Optimal Tradeoff Between Area and Delay in Multiple Constant Multiplications", journal = MICPRO, volume = 35, number = 8, pages = " 729--741 ", month = NOV, year = 2011, abstract = "Over the years many efficient algorithms for the multiplierless design of multiple constant multiplications (MCM) have been introduced. These algorithms primarily focus on finding the fewest number of addition/subtraction operations that generate the MCM. Although the complexity of an MCM design is decreased by reducing the number of operations, their solutions may not lead to an MCM design with optimal area at gate-level since they do not consider the implementation costs of the operations in hardware. This article introduces two approximate algorithms that aim to optimize the area of the MCM operation by taking into account the gate-level implementation of each addition and subtraction operation which realizes a constant multiplication. To find the optimal tradeoff between area and delay, the proposed algorithms are further extended to find an MCM design with optimal area under a delay constraint. Experimental results clearly indicate that the solutions of the proposed algorithms lead to significantly better MCM designs at gate-level when compared to those obtained by the solutions of algorithms designed for the optimization of the number of operations.", keywords = "Multiple constant multiplications; Addition and subtraction architectures; Gate-level area optimization; Delay aware area optimization; 0-1 integer linear programming; Graph-based algorithms.", issn = "0141-9331", doi = "10.1016/j.micpro.2011.08.009" } @Article{Lazzari-JOLPE11, author = "Cristiano Lazzari and Jorge Fernandes and Paulo Flores and José Monteiro", title = "Low Power Multiple-value Voltage-mode Look-up Table for Quaternary Field Programmable Gate Arrays", journal = JOLPE, volume = 7, number = 2, pages = " 294--301 ", month = APR, year = 2011, abstract = "FPGA structures are widely used as they enable early time-to-market and reduced non-recurring engineering costs in comparison to ASIC designs. Interconnections play a crucial role in modern FPGAs, because they dominate delay, power and area. Multiple-valued logic allows the reduction of the number of interconnections in the circuit, hence can serve as a mean to effectively curtail the impact of interconnections. In this work we propose a new look-up table structure based on a low-power high-speed quaternary voltage-mode device. The most important characteristics of the proposed architecture are that it is a voltage-mode structure, which allows reduced power consumption, and it is implemented using a standard CMOS technology. Our quaternary implementation overcomes previous proposed techniques with simple and efficient CMOS structures. Moreover, results show significant reductions on power consumption and timing in comparison to binary implementations with similar functionality.", keywords = "Multiple-value Logic; Quaternary Logic; Look-up Tables; FPGAs; Standard CMOS Technology.", issn = "1546-1998", doi = "10.1166/jolpe.2011.1138" } @Article{Aksoy-MICPRO10-RefE, author = "Levent Aksoy and Ece Olcay Gunes and Paulo Flores", title = "Search algorithms for the multiple constant multiplications problem: Exact and approximate", journal = MICPRO, volume = 34, number = 5, pages = " 151--162 ", month = AUG, year = 2010, abstract = "This article addresses the multiplication of one data sample with multiple constants using addition/subtraction and shift operations, i.e., the multiple constant multiplications (MCM) operation. In the last two decades, many efficient algorithms have been proposed to implement the MCM operation using the fewest number of addition and subtraction operations. However, due to the NP-hardness of the problem, almost all the existing algorithms have been heuristics. The main contribution of this article is the proposal of an exact depth-first search algorithm that, using lower and upper bound values of the search space for the MCM problem instance, finds the minimum solution consuming less computational resources than the previously proposed exact breadth-first search algorithm. We start by describing the exact breadth-first search algorithm that can be applied on real mid-size instances. We also present our recently proposed approximate algorithm that finds solutions close to the minimum and is able to compute better bounds for the MCM problem. The experimental results clearly indicate that the exact depth-first search algorithm can be efficiently applied to large size hard instances that the exact breadth-first search algorithm cannot handle and the heuristics can only find suboptimal solutions.", keywords = "Multiple constant multiplications problem; Depth-first search; Breadth-first search; Graph-based algorithms; Finite Impulse Response filters.", issn = "0141-9331", doi = "10.1016/j.micpro.2009.10.001", note = "Special issue on selected papers from NORCHIP 2008." } @Article{Morgado-TODAES09, author = "P. Marques Morgado and Paulo F. Flores and L. Miguel Silveira", title = "Generating Realistic Stimuli for Accurate Power Grid Analysis", journal = TODAES, volume = 14, number = 3, pages = " 40:1--40:26 ", month = MAY, year = 2009, abstract = "Power Analysis tools are an integral component of any current power sign-off methodology. The performance of a design's power grid affects the timing and functionality of a circuit directly impacting the overall performance. Ensuring power grid robustness implies taking into account, among others, static and dynamic effects of voltage drop, ground bounce and electromigration. This type of verification is usually done by simulation, targeting a worst-case scenario where devices, switching almost simultaneously, could impose stern current demands on the power grid. While determination of the exact worst-case switching conditions from the grid perspective is usually not practical, the choice of simulation stimuli has a critical effect on the results of the analysis. Targetting safe but unrealistic settings could lead to pessimistic results and costly over-designs in terms of die area. In this paper we describe a software tool that generates a reasonable, realistic, set of stimuli for simulation. The approach proposed accounts for timing and spatial restrictions that arise from the circuit's netlist and placement and generates an approximation to the worst-case condition. The resulting stimuli indicates that only a fraction of the gates change in any given timing window, leading to a more robust verification methodology, specially in the dynamic case. Generating such stimuli is akin to performing a standard static timing analysis, so the tool fits well within conventional design frameworks. Furthermore, the tool can be used for hot-spot detection in early design stages.", keywords = "Power grid; Verification; Simulation; Voltage drop; Ground bounce; Stimuli Generationy.", issn = "1084-4309", doi = "10.1145/1529255.1529262", note = "Special issue on Demonstrable Sfotware Systems and Hardware Platforms II" } @Article{Aksoy-TCADICS08, author = "Levent Aksoy and Eduardo da Costa and Paulo Flores and Jos{\'e} Monteiro", title = "Exact and Approximate Algorithms for the Optimization of Area and Delay in Multiple Constant Multiplications", journal = TCADICS, volume = 27, number = 6, pages = " 1013--1026 ", month = JUN, year = 2008, abstract = "The main contribution of this paper is an exact common subexpression elimination algorithm for the optimum sharing of partial terms in multiple constant multiplications (MCMs). We model this problem as a Boolean network that covers all possible partial terms that may be used to generate the set of coefficients in the MCM instance. We cast this problem into a 0{\'o}1 integer linear programming (ILP) problem by requiring that the single output of this network is asserted while minimizing the number of gates representing operations in the MCM implementation that evaluate to one. A satisfiability (SAT)-based 0{\'o}1 ILP solver is used to obtain the exact solution. We argue that for many real problems, the size of the problem is within the capabilities of current SAT solvers. Because performance is often a primary design parameter, we describe how this algorithm can be modified to target the minimum area solution under a user-specified delay constraint. Additionally, we propose an approximate algorithm based on the exact approach with extremely competitive results. We have applied these algorithms on the design of digital filters and present a comprehensive set of results that evaluate ours and existing approximation schemes against exact solutions under different number representations and using different SAT solvers.", keywords = "Canonical signed digit (CSD); common subexpression elimination (CSE); delay constraints; minimal signed digit (MSD); multiple constant multiplications (MCMs); pseudo- Boolean optimization (PBO).", issn = "0278-0070", doi = "10.1109/TCAD.2008.923242" } @Article{Gil-JSAT08, author = "Lu{\'\i}s Gil and Paulo Flores and Lu{\'\i}s Miguel Silveira", title = "{PMSat}: a parallel version of {MiniSAT}", journal = JSAT, volume = 6, pages = " 71--98 ", year = 2008, abstract = "Parallel computing has become an affordable reality forcing a shift in the programming paradigm from sequential to concurrent applications, specially those who demand much computational power or with large search spaces like SAT-solvers. In this context we present the research, planning and implementation of PMSat: a parallel version of MiniSAT with MPI (Message Passing Interface) technology, to be executed in clusters or grids of computers. The main features of the program are described: search modes, assumptions pruning and share of learnt clauses. An analysis of its performance and load balance is also presented.", keywords = "Parallel Computing; SAT-solver; Satisfiability; Message Passing Interface.", issn = "1574-0617", annote = "Submitted, week-reject 2008-05-17, resubmited 2008-08-27", history = "Submitted, week-reject 2008-05-17, resubmited 2008-08-27" } @Article{Flores-TODAES01, author = "Paulo F. Flores and Hor{\'a}cio C. Neto and Jo{\~a}o P. Marques-Silva", title = "An exact solution to the minimum size test pattern problem", journal = TODAES, publisher = "ACM Press", address = "New York, NY, USA", volume = 6, number = 4, pages = " 629--644 ", month = OCT, year = 2001, abstract = "This article addresses the problem of test pattern generation for single stuck-at faults in combinational circuits, under the additional constraint that the number of specified primary input assignments is minimized. This problem has different applications in testing, including the identification of ``don't care'' conditions to be used in the synthesis of Built-In Self-Test (BIST) logic. The proposed solution is based on an integer linear programming (ILP) formulation which builds on an existing Propositional Satisfiability (SAT) model for test pattern generation. The resulting ILP formulation is linear on the size of the original SAT model for test generation, which is linear on the size of the circuit. Nevertheless, the resulting ILP instances represent complex optimization problems, that require dedicated ILP algorithms. Preliminary results on benchmark circuits validate the practical applicability of the test pattern minimization model and associated ILP algorithm.", keywords = "Automatic Test Pattern Generation (ATPG); Built-In Self-Test (BIST); Integer Linear Programming (ILP); propositional satisfiability (SAT); verification and test.", issn = "1084-4309", doi = "http://doi.acm.org/10.1145/502175.502186" } @Article{Sousa-Ingenium92, author = "Ana Sousa and Jos{\'e} Abreu and Paulo Flores", title = "{VHDL} novo suporte às metodologias de {CAD}", journal = "Ingenium. Revista da Ordem dos Engenheiros)", pages = " 49--55 ", month = APR, year = 1992, keywords = " " } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @String{PFF-AUTHOR = "Paulo Flores" } @InProceedings{Aksoy-ISCAS15, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "Approximation of Multiple Constant Multiplications Using Minimum Look-Up Tables on {FPGA}", booktitle = ISCAS, pages = " ??--?? ", location = "Lisbon, Portugal", month = MAY # " 24--27, ", year = 2015, abstract = "In many digital signal processing (DSP) systems, computations can be carried out within a tolerable error range rather than finding the exact output, enabling significant reductions in area, delay, or power dissipation of the design. This paper addresses the problem of approximating the multiple constant multiplications (MCM) operation which occurs frequently in DSP applications. We consider the realization of constant multiplications using look-up tables (LUTs) on field programmable gate arrays (FPGA) and introduce an exact algorithm, called THETIS, that can find a minimum number of distinct LUTs required to realize the partial products of constant multiplications, satisfying an error constraint. Experimental results show that THETIS can achieve significant reductions in number of LUTs on MCM instances and its solutions lead to less complex filter designs on FPGA than those realized using original filter coefficients. ", keywords = "Aproximating computing, multiple constant multiplications (MCM), FPGA", note = ACCEPTED*, annote = "Poster" } @InProceedings{Aksoy-ICCD14, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "Efficient Design of {FIR} Filters Using Hybrid Multiple Constant Multiplications on {FPGA}", booktitle = ICCD, publisher = "IEEE", pages = " 42--47 ", location = "Seoul, Korea", month = OCT # " 19--22, ", year = 2014, abstract = "The multiple constant multiplication (MCM) block, that realizes the multiplication of constants by a variable, is a ubiquitous operation in digital signal processing (DSP) systems. It can be implemented using generic multipliers or shifts and adders/subtractors. This paper addresses the problem of finding the minimum number of adders/subtractors to realize the MCM block while a number of multipliers are available to realize some constant multiplications. Such a situation appears in the design of DSP systems on field programmable gate arrays (FPGAs) which also include generic multipliers. We present a 0-1 integer linear programming (ILP) formulation of this problem, yielding an exact common subexpression elimination (CSE) method. Due to the NP-completeness of this problem, we also introduce an approximate graph-based (GB) algorithm. Experimental results show that the proposed methods can find better solutions than a state-of-art algorithm and the use of different numbers of multipliers in the MCM block leads to filter designs with different number of slices, delay, and power dissipation which enable a designer to choose the one that fits best in an application.", keywords = "Multiple Constant Multiplication (MCM); FPGAs; Exact Common Subexpression elimination (CSE) method; Approximate Graph-Based (GB) algorithm; Integer Linear Programming (ILP).", doi = "10.1109/ICCD.2014.6974660" } @InProceedings{Sebastiao-HPCS14, author = "Nuno Sebastião and Paulo Flores and Nuno Roma", title = "Optimized {ASIP} Architecture for Compressed {BWT}-Indexed Search in Bioinformatics Applications", booktitle = HPCS, publisher = "IEEE", pages = " 527--534 ", location = "Bologna, Italy", month = JUL # " 21--25, ", year = 2014, abstract = "Compressed indexes are adopted by a vast set of bioinformatics applications that deal with extremely large datasets, mainly due to the inherently high memory requirements of uncompressed alternatives. However, the additional computational overhead that is imposed by the usage of such indexes makes them harder to implement in embedded computational platforms, such as biochips, with strict processing and power restrictions. Furthermore, compressed indexes are often characterized by a significant usage of bit-level operations, some of which are not commonly available on General Purpose Processors (GPPs). To circumvent this limitation, an Application-Specific Instruction-set Processor (ASIP) architecture is proposed to accelerate the processing of biological sequences (e.g., alignment, mapping, etc.) using compressed full-text indexes based on the Burrows-Wheeler Transform (BWT). The proposed processor was built over a RISC micro-architecture and extends the Xilinx MicroBlaze ISA with additional bit-level operations, especially tailored for compressed indexes. When used to perform search operations over the considered compressed index, the proposed architecture provides a reduction of the number of required instructions by about one half. Furthermore, when prototyped on a Xilinx Virtex-7 FPGA, the ASIP proved to offer an overall speedup between 3.1x and 4.5x for the execution of a single threaded operation. To ensure a further processing scalability, the proposed ASIP was designed in order to be easily used as the basic processing unit of multi-core systems, especially tuned for the parallel processing of massive datasets of biological reads.", keywords = "Bioinformatics applications; Embedded computational platforms; Processing of biological sequences: alignment, mapping; Application-Specific Instruction-set Processor (ASIP); Compressed full-text indexes; Burrows-Wheeler Transform (BWT); MicroBlaze ISA extension.", isbn = "978-1-4799-5312-7", doi = "10.1109/HPCSim.2014.6903731" } @InProceedings{Aksoy-ISCAS14, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "{ECHO}: A Novel Method for the Multiplierless Design of Constant Array Vector Multiplication", booktitle = ISCAS, publisher = "IEEE", pages = " 1456-1459 ", location = "Melbourne, Australia", month = JUN # " 1--5, ", year = 2014, abstract = "The constant array vector multiplication (CAVM) operation realizes the multiplication of a constant array by a vector of variables and occurs in the design of direct form finite impulse response (FIR) and infinite impulse response (IIR) filters. In this paper, for the first time, we directly target the optimization of the multiplierless design of a CAVM operation and introduce a novel algorithm, called ECHO, that can find the fewest number of adders and subtractors required for its implementation. We also describe some hardware optimization techniques that can reduce the gate-level area and delay of the CAVM design. It is shown that the solutions of ECHO include significantly less number of operations and yield less area in FIR filter designs than those of previously proposed state-of-art algorithms.", keywords = "FIR, IIR filters; Constant Array Vector Multiplication (CAVM); Multiplierless design; Cirucuit/hardware optimization techniques; DSP; Gate-level implementation.", isbn = "978-1-4799-3431-7", doi = "10.1109/ISCAS.2014.6865420", annote = "Poster" } @InProceedings{Aksoy-IWLS14, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "On the Multiplierless Design of Correctly Rounded Multiple Constant Divisions", booktitle = IWLS, pages = " ??--?? ", location = "San Francisco, USA", month = MAY # " 30, ", year = 2014, abstract = "The previously proposed algorithms designed for the constant divisions use the multiply-add architecture which is expensive in terms of area and power dissipation in hardware. This paper introduces the problem of finding the minimum number of adders/subtractors which realize the correctly rounded multiple constant divisions (MCD) under the shift-adds architecture. It proposes a depth-first search method which can explore the whole search space, ensuring the minimum solution. It also presents a local search method which can handle the instances that the exact algorithm cannot cope with due to the NP-completeness of the MCD problem. Both algorithms are equipped with techniques that maximize the sharing of common partial products among the constant multiplications. To the best of our knowledge, these are the first algorithms proposed for the multiplierless design of the correctly rounded MCD. Experimental results include the results of algorithms, indicating that the local search method can find solutions close to the minimum using little computational resources and can obtain better solutions than a state-of-art algorithm. They also include the results of MCD designs under different architectures, showing that the MCD designs under the shift-adds architecture occupy less area and consume less power than those under the multiply-add architecture.", keywords = "Constant division; multiply-add architecture; shift-adds architecture; rounded multiple constant divisions (MCD).", note = PRESENTED* } @InProceedings{Aksoy-DATE14, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "Optimization of Design Complexity in Time-Multiplexed Constant Multiplications", booktitle = DATE, publisher = "IEEE", pages = " 1--4 ", location = "Dresden, Germany", month = MAR # " 24--28,", year = 2014, abstract = "The multiplication of constants by a data input is an essential operation in digital signal processing (DSP) systems. For applications requiring a large number of constant multiplications under stringent hardware constraints, it is generally realized under a folded architecture, where a single constant selected from a set of multiple constants is multiplied by the data input at each time, called time-multiplexed constant multiplication (TMCM). This paper addresses the problem of optimizing the complexity of a TMCM design and introduces an algorithm that finds the least complex TMCM design by sharing the logic operators, i.e., adders, subtractors, adders/subtractors, and multiplexors (MUXes). It includes efficient search methods, yielding better results than existing TMCM algorithms.", keywords = "Time-Multiplexed Constant Multiplication (TMCM); Folded architecture; shift-adds architecture optimization; Digital Signal Processing (DSP).", doi = "10.7873/DATE.2014.313", annote = "Poster" } @InProceedings{Aksoy-VLSI-SoC13, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "Towards the Least Complex Time-Multiplexed Constant Multiplication", booktitle = VLSI-SOC, pages = " 328--331 ", location = "Istanbul, Turkey", month = OCT # " 7--9, ", year = 2013, abstract = "The multiplication of a variable by a single constant selected from a set of fixed constants at a time, called the time-multiplexed constant multiplication (TMCM), is frequently used in digital signal processing (DSP) systems. Existing algorithms implement the TMCM operation using multiplexers (MUXes), adders/subtractors, and shifts, and reduce its complexity by merging single/multiple constant multiplication graphs and by sharing the basic structures. This paper introduces ARION, that exploits the most common partial terms in the TMCM design on top of the previously proposed DAGfusion algorithm, which merges the single constant multiplication graphs. Experimental results show that ARION obtains significantly better solutions than prominent TMCM methods.", doi = "10.1109/VLSI-SoC.2013.6673302" } @InProceedings{Aksoy-ECCTD13, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "Exploration of Tradeoffs in the Design of Integer Cosine Transforms for Image Compression", booktitle = ECCTD, pages = " 1--4 ", location = "Dresden, Germany", month = SEP # " 8--12, ", year = 2013, abstract = "In image data compression, integer cosine transforms (ICTs) have been preferred to discrete cosine transforms (DCTs) due to their similar transform efficiency and lower implementation cost. However, there exist many alternative ICTs with different performance measures and implementation costs. In this work, we explore all possible ICTs, compute their performance measures, and find their implementation costs in terms of the number of adders/subtractors, where a state-of-art technique is used to realize ICTs under a shift-adds architecture. We also investigate the tradeoff between performance and implementation cost, present the pareto-optimal points of this tradeoff, and introduce promising ICTs that were not considered before.", keywords = "Integer Cosine Transforms (ICT); Discrete Cosine Transforms (DCT); Shift-Adds Architecture; Multiplier-Less Hardware Logic Gates.", doi = "10.1109/ECCTD.2013.6662223" } @InProceedings{Brito-NEWCAS13, author = "Diogo Brito and Jorge Fernandes and Paulo Flores and José Monteiro", title = "Standard {CMOS} Voltage-Mode {QLUT} Using a Clock Boosting Technique", booktitle = NEWCAS, pages = " 1--4 ", location = "Paris, France", month = JUN # " 16--19, ", year = 2013, abstract = "Interconnect has become preponderant in many aspects of digital circuit design, namely delay, power and area. This effect is particularly true for FPGAs, where interconnection is often the most limiting factor. Multiple-valued logic allows to reduce interconnections, within logic cells and between them, hence effectively mitigating the impact of interconnections. In this paper we propose a new look-up table structure based on a low-power high-speed quaternary voltage-mode device. Our quaternary implementation overcomes the drawbacks of previously proposed techniques by using a standard CMOS technology and a clock boosting technique to enhance speed without increasing consumption. Moreover, we present an ASIC prototype of a full adder based on the designed look-up table and experimental results are obtained and compared with simulation. The prototype is designed to work at 100 MHz and it consumes 128 uW.", keywords = "Quaternary Logic, Look-up Table, FPGA, Clock Boosting.", isbn = "978-1-4799-0618-5", doi = "10.1109/NEWCAS.2013.6573573" } @InProceedings{Neves-ASAP13, author = "Nuno Neves and Nuno Sebastião and André Patrício and David Matos and Pedro Tomás and Paulo Flores and Nuno Roma", title = "{BioBlaze}: Multi-Core {SIMD} {ASIP} for {DNA} Sequence Alignment", booktitle = ASAP, publisher = "IEEE", pages = " 241--244 ", location = "Washington D.C., USA", month = JUN # " 5--7, ", year = 2013, abstract = "A new Application-Specific Instruction-set Processor (ASIP) architecture for biological sequences alignment is proposed in this manuscript. This architecture achieves high processing throughputs by exploiting both fine and coarse-grained parallelism. The former is achieved by extending the Instruction Set Architecture (ISA) of a synthesizable processor to include multiple specialized SIMD instructions that implement vector-vector and vector-scalar arithmetic, logic, load/store and control operations. Coarse-grained parallelism is achieved by using multiple cores to cooperatively align multiple sequences in a shared memory architecture, comprising proper hardware-specific synchronization mechanisms. To ease the programming, a compilation framework based on an adaptation of the GCC back-end was also implemented. The proposed system was prototyped and evaluated on a Xilinx Virtex-7 FPGA, achieving a 200MHz working frequency. A sequential and a state-of-the-art SIMD implementations of the Smith-Waterman algorithm were programmed in both the proposed ASIP and an Intel Core i7 processor. When comparing the achieved speedups, it was observed that the proposed ISA achieves a 40x speedup, which contrasts with the 11x speedup provided by SSE2 in the Intel Core i7 processor. The scalability of the multi-core system was also evaluated and proved to scale almost linearly with the number of cores.", issn = "2160-0511", isbn = "978-1-4799-0494-5", doi = "10.1109/ASAP.2013.6567581", annote = "Accepted as a poster." } @InProceedings{Marques-FLAIRS13, author = "Ricardo Marques and Luis Guerra e Silva and Paulo Flores and L. Miguel Silveira.", title = "Improving {SAT} Solver Efficiency using a Multi-core Approach", booktitle = FLAIRS, pages = " 94--99 ", location = "Florida, USA", month = MAY # " 22--24, ", year = 2013, abstract = "Many practical problems in multiple fields can be converted to a SAT problem, or a sequence of SAT problems, such that their solution immediately implies a solution to the original problem. Despite the enormous progress achieved over the last decade in the development of SAT solvers, there is strong demand for higher algorithm efficiency to solve harder and larger problems. The widespread availability of multi-core, shared memory parallel environments provides an opportunity for such improvements. In this paper we present our results on improving the effectiveness of standard SAT solvers on such architectures, through a portfolio approach. Multiple instances of the same basic solver using different heuristic strategies, for search-space exploration and problem analysis, share information and cooperate towards the solution of a given problem. Results from the application of our methodology to known problems from SAT competitions show relevant improvements over the state of the art and yield the promise of further advances." } @InProceedings{Aksoy-GLSVLSI13, author = "Levent Aksoy and Paulo Flores and José Monteiro", title = "{SIREN}: A Depth-First Search Algorithm for the Filter Design Optimization Problem", booktitle = GLSVLSI, pages = " 179--184 ", location = "Paris, France", month = MAY # " 2--3, ", year = 2013, abstract = "This paper addresses the filter design optimization (FDO) problem that is to find a set of filter coefficients which yields the least design complexity while meeting the required filter constraints. The design complexity of a filter is defined in terms of the total number of adders/subtracters, assuming that the multiplication of coefficients by the filter input is realized under a shift-adds architecture. Existing algorithms use efficient search methods, but none of them can guarantee the minimum design complexity. Hence, we propose an exact algorithm, called SIREN, that finds an optimum solution of the FDO problem under the minimum quantization value. It is based on a depth-first search method equipped with an exact technique, that finds the minimum number of adders/subtracters in the multiplier block of the filter, and search pruning techniques that enable it to be applicable to practical instances. Experimental results show that SIREN can still find better solutions than efficient FDO algorithms and its solutions lead to filters with significantly less area when compared to a straightforward filter design technique.", keywords = "Filter design optimization; Depth-first search; Linear programming; Multiplierless filter design.", isbn = "978-1-4503-2032-0", doi = "10.1145/2483028.2483087" } @InProceedings{Brito-ICECS12, author = "Diogo Brito and Jorge Fernandes and Paulo Flores and José Monteiro", title = "Design and Characterization of a {QLUT} in a Standard {CMOS} Process", booktitle = ICECS, pages = " 288--291 ", location = "Seville, Spain", month = DEC # " 9--12, ", year = 2012, abstract = "Interconnect has become preponderant in many aspects of circuit design, namely delay, power and area. This effect is particularly true for FPGAs, where interconnect is often the most limiting factor. Quaternary logic offers a means to reduce interconnect since each circuit wire can, in principle, carry the same information as two binary wires. We have proposed in [1] a design implementing a quaternary low-power highspeed look-up table. The main features of this circuit are being based on a voltage-mode structure and using only standard CMOS technology. In this paper we present the design of a prototype implementation and experimental results. These results are discussed and conclusions are drawn that provide further guidelines for improvement.", keywords = "Multi-value logic; Quaternary circuits; Circuit Layout, Fabrication and Test.", isbn = "978-1-4673-1261-5 (Print) 978-1-4673-1259-2 (Online)", doi = "10.1109/ICECS.2012.6463744" } @InProceedings{Aksoy-ICCAD12-RefD, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "Multiple Tunable Constant Multiplications: Algorithms and Applications", booktitle = ICCAD, pages = " 473--479 ", location = "San José, CA, USA", month = NOV # " 5--8, ", year = 2012, abstract = "The multiple constant multiplications (MCM) problem, that is defined as finding the minimum number of addition and subtraction operations required for the multiplication of multiple constants by an input variable, has been the subject of great interest since the complexity of many digital signal processing (DSP) systems is dominated by an MCM operation. This paper introduces a variant of the MCM problem, called multiple tunable constant multiplications (MTCM) problem, where each constant is not fixed as in the MCM problem, but can be selected from a set of possible constants. We present an exact algorithm that formalizes the MTCM problem as a 0-1 integer linear programming (ILP) problem when constants are defined under a number representation. We also introduce a local search method for the MTCM problem that includes an efficient MCM algorithm. Furthermore, we show that these techniques can be used to solve various optimization problems in finite impulse response (FIR) filter design and we apply them to one of these problems. Experimental results clearly show the efficiency of the proposed methods when compared to prominent algorithms designed for the MCM problem", keywords = "MTCM - Multiple Tunable Constant Multiplications; 0-1 ILP; Exact algorithm; Local search algorithm.", issn = "1092-3152", doi = "10.1145/2429384.2429482" } @InProceedings{Aksoy-DATE12, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "Design of Low-Complexity Digital Finite Impulse Response Filters on {FPGA}s", booktitle = DATE, pages = " 1197--1202 ", location = "Dresden, Germany", month = MAR # " 12--16, ", year = 2012, abstract = "The multiple constant multiplications (MCM) operation, which realizes the multiplication of a set of constants by a variable, has a significant impact on the complexity and performance of the digital finite impulse response (FIR) filters. Over the years, many high-level algorithms and design methods have been proposed for the efficient implementation of the MCM operation using only addition, subtraction, and shift operations. The main contribution of this paper is the introduction of a high-level synthesis algorithm that optimizes the area of the MCM operation and, consequently, of the FIR filter design, on field programmable gate arrays (FPGAs) by taking into account the implementation cost of each addition and subtraction operation in terms of the number of fundamental building blocks of FPGAs. It is observed from the experimental results that the solutions of the proposed algorithm yield less complex FIR filters on FPGAs with respect to those whose MCM part is implemented using prominent MCM algorithms and design methods.", keywords = "FIR filter design;FPGA;MCM algorithms;field programmable gate arrays;high-level algorithms;low-complexity digital finite impulse response filters design;multiple constant multiplications operation;operations level synthesis algorithm;shift operations;FIR filters;field programmable gate arrays;high level synthesis.", issn = "1530-1591", isbn = "978-1-4577-2145-8", doi = "10.1109/DATE.2012.6176675" } @InProceedings{Aksoy-VLSI-SoC11, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "A Hybrid Algorithm for the Optimization of Area and Delay in Linear {DSP} Transforms", booktitle = VLSI-SOC, pages = " 148--153 ", location = "Hong Kong, China", month = OCT # " 3--5,", year = 2011, abstract = "This paper addresses the problem of multiplierless realization of linear transforms using the fewest number of addition and subtraction operations and introduces a hybrid algorithm that incorporates a graph-based technique, called the difference method, and a Common Subexpression Elimination (CSE) algorithm. In the proposed algorithm, while the difference method extracts the most promising realizations of linear transforms in each iteration, the CSE algorithm achieves the most common minimum conflicting subexpressions in each solution of the difference method. This paper also describes how the hybrid algorithm can be modified in order to find a solution with the fewest number of operations under a delay constraint. The experimental results on a comprehensive set of instances show the efficiency of the hybrid algorithms, at both high-level and gate-level, in comparison to previously proposed algorithms.", keywords = "Multiplierless Linear Transforms, Hybrid algorithms, Graph-based techniques, Common Subexpression Elimination (CSE), High-level and gate-level optimization.", isbn = "978-1-4577-0171-9 / 978-1-4577-0169-6", doi = "10.1109/VLSISoC.2011.6081637" } @InProceedings{Aksoy-ECCTD11, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "Optimization of Gate-Level Area in High Throughput Multiple Constant Multiplications", booktitle = ECCTD, publisher = "IEEE", pages = " 588--591 ", location = "Linköping, Sweden", month = AUG # " 29--31, ", year = 2011, abstract = "This paper addresses the problem of optimizing gate-level area in a pipelined Multiple Constant Multiplications (MCM) operation and introduces a high-level synthesis algorithm, called HCUB-DC+ILP. In the HCUB-DC+ILP algorithm, initially, a solution with the fewest number of operations under a minimum delay constraint is found by the Hcub-DC algorithm. Then, the area around this local minimum point is explored exactly using a 0-1 Integer Linear Programming (ILP) technique that considers the gate-level implementation of the pipelined MCM operation. The experimental results at both high-level and gate-level clearly show the efficiency of HCUB-DC+ILP over previously proposed prominent MCM algorithms.", keywords = "Algorithm Design and Analysis, Optimization, Pipeline Processing, Signal Processing Algorithms, Pipelined Multiple Constant Multiplications (MCM), Integer Linear Programming (ILP), Gate-Level Optimization, HCUB-DC+ILP and Hcub-DC algorithm.", isbn = "978-1-4577-0617-2 / 978-1-4577-0169-6", doi = "10.1109/ECCTD.2011.6043602" } @InProceedings{Aksoy-ISCAS11, author = "Levent Aksoy and Cristiano Lazzari and Eduardo Costa and Paulo Flores and José Monteiro", title = "Optimization of Area in Digit-Serial Multiple Constant Multiplications at Gate-Level", booktitle = ISCAS, publisher = "IEEE", pages = " 2737-- 2740 ", location = "Rio de Janeiro, Brazil", month = MAY # " 15--18, ", year = 2011, abstract = "The last two decades have seen many efficient algorithms and architectures for the design of low-complexity bit-parallel Multiple Constant Multiplications (MCM) operation, that dominates the complexity of Digital Signal Processing (DSP) systems. On the other hand, digit-serial architectures offer alternative low-complexity designs, since digit-serial operators occupy less area and are independent of the data wordlength. This paper introduces the problem of designing a digit-serial MCM operation with minimal area at gate-level and presents the exact formalization of the area optimization problem as a 0-1 Integer Linear Programming (ILP) problem. Experimental results show the efficiency of the proposed algorithm and digit-serial MCM designs in terms of area at gate-level.", keywords = "Multiple Constant Multiplications (MCM), Digit-Serial Multiple Constant Multiplication operation, Low-complexity design, Area optimization, 0-1 Integer Linear Programming problem, Area optimization problem, Gate-level optimization.", issn = "0271-4302", isbn = "978-1-4244-9473-6 / 978-1-4244-9472-9", doi = "10.1109/ISCAS.2011.5938171", annote = "Accepted as poster" } @InProceedings{Aksoy-GLSVLSI11-mcmSerial, author = "Levent Aksoy and Cristiano Lazzari and Eduardo Costa and Paulo Flores and José Monteiro", title = "Efficient Shift-Adds Design of Digit-Serial Multiple Constant Multiplications", booktitle = GLSVLSI, publisher = "ACM", pages = " 61--66 ", location = "Lausanne, Switzerland", month = MAY # " 2--4, ", year = 2011, abstract = "The bit-parallel realization of the multiplication of a variable by a set of constants using only addition/subtraction and shift operations has been explored extensively over the years, as large number of constant multiplications dominate the complexity of many Digital Signal Processing (DSP) systems. On the other hand, digit-serial architectures offer alternative low-complexity designs, since digit-serial operators occupy less area and are independent of the data wordlength. This paper introduces an approximate algorithm that targets the optimization of the gate-level area of digit-serial constant multiplications under a shift-adds architecture. Experimental results on a comprehensive set of instances indicate that the approximate algorithm presented in this paper gives better solutions than the previously proposed algorithms in terms of area at gate-level and yields alternative low-complexity designs relatively to the bit-parallel design. Furthermore, it is observed on the digit-serial filter designs that the use of shift-adds architecture yields area reduction up to 43.6\% with respect to the filter designs that use generic digit-serial constant multipliers.", keywords = "Multiple constant multiplications (MCM), digit-serial arithmetic, gate-level area optimization, finite impulse response (FIR) filters.", isbn = "978-1-4503-0667-6", doi = "10.1145/1973009.1973023", annote = "Tipo A no IST" } @InProceedings{Aksoy-GLSVLSI11-mcmLowpower, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "Design of Low-Power Multiple Constant Multiplications Using Low-Complexity Minimum Depth Operations", booktitle = GLSVLSI, publisher = "ACM", pages = " 79--84 ", location = "Lausanne, Switzerland", month = MAY # " 2--4, ", year = 2011, abstract = "Existing optimization algorithms for the multiplierless realization of multiple constant multiplications (MCM) typically target the minimization of the number of addition and subtraction operations. Since power dissipation is directly related to the amount of hardware, some power reduction is indirectly achieved by these algorithms. However, in many cases, glitching plays an equally important role in defining the power consumption. This is specially true for arithmetic circuits, and in particular to MCM due to high logic depth and large number of re-convergent paths. This paper introduces exact algorithms that search the optimal area of an MCM design at gate-level where each constant multiplication is implemented in its minimum depth. Experimental results show that the proposed algorithms lead to MCM designs consuming significantly less power with respect to those obtained by the MCM algorithms.", keywords = "Multiple constant multiplications (MCM), 0-1 integer linear programming, high-level synthesis, low-power MCM design.", isbn = "978-1-4503-0667-6", doi = "10.1145/1973009.1973026", annote = "Tipo A no IST" } @InProceedings{Jaccottet-VLSI-SoC10, author = "Diego Jaccottet and Eduardo Costa and Levent Aksoy and Paulo Flores and José Monteiro", title = "Design of Low-Complexity and High-Speed Digital Finite Impulse Response Filters", booktitle = VLSI-SOC, pages = " 292--297 ", location = "Madrid, Spain", month = SEP # " 27--29,", year = 2010, abstract = "In this paper, we introduce a design methodology to implement low-complexity and high-speed digital Finite Impulse Response (FIR) filters. Since FIR filters suffer from a large number of constant multiplications, in the proposed method the constant multiplications are replaced by addition/subtraction and shift operations. Also, based on the design objective, i.e., low-complexity or high-speed, the addition/subtraction operations are implemented using Ripple Carry Adder (RCA) or Carry-Save Adder (CSA) architectures respectively. Furthermore, high-level algorithms designed for the optimization of the number of RCA and CSA blocks are used to reduce the complexity of the FIR filter. Thus, a Computer-Aided Design (CAD) tool that synthesizes low-complexity and high-speed FIR filters in a shift-adds architecture is developed. It is observed from the experimental results on FIR filter instances that the developed CAD tool can find better FIR filter designs in terms of area and delay than those obtained using efficient general multipliers.", keywords = "Multiple constant multiplications, addition and subtraction architectures, low complexity FIR filter, ripple carry adder, carry saver adder.", isbn = "978-1-4244-6469-2", doi = "10.1109/VLSISOC.2010.5642676" } @InProceedings{Aksoy-DSD10, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and José Monteiro", title = "Optimization of Area and Delay at Gate-Level in Multiple Constant Multiplications", booktitle = DSD, pages = " 3--10 ", location = "Lille, France", month = SEP # " 1--3,", year = 2010, abstract = "Although many efficient high-level algorithms have been proposed for the realization of Multiple Constant Multiplications (MCM) using the fewest number of addition and subtraction operations, they do not consider the low-level implementation issues that directly affect the area, delay, and power dissipation of the MCM design. In this paper, we initially present area efficient addition and subtraction architectures used in the design of the MCM operation. Then, we propose an algorithm that searches an MCM design with the smallest area taking into account the cost of each operation at gate-level. To address the area and delay tradeoff in MCM design, the proposed algorithm is improved to find the smallest area solution under a delay constraint. The experimental results show that the proposed algorithms yield low-complexity and high-speed MCM designs with respect to those obtained by the prominent algorithms designed for the optimization of the number of operations and the optimization of area at gate-level.", keywords = "Multiple constant multiplications, addition and subtraction architectures, delay aware area optimization, gate-level area optimization, graph-based algorithms.", isbn = "78-1-4244-7839-2", doi = "10.1109/DSD.2010.32" } @InProceedings{Sebastiao-HPCS10, author = "Nuno Sebastião and Tiago Dias and Nuno Roma and Paulo Flores", title = "Integrated Accelerator Architecture for {DNA} Sequences Alignment with Enhanced Traceback Phase", booktitle = HPCS, pages = " 16--23", location = "Caen, France", month = JUN # " 28--" # JUL # " 2, ", year = 2010, abstract = "Dynamic programming algorithms are widely used to find the optimal sequence alignment between any two DNA sequences. This paper presents an innovative technique to significantly reduce the computation time and memory space requirements of the traceback phase of the Smith-Waterman algorithm, together with a flexible and scalable hardware architecture to accelerate the overall procedure. The results obtained from an implementation using a Virtex-4 FPGA showed that the proposed technique is feasible and is able to provide a significant speedup. For the considered test sequences, a speedup of about 6000 was obtained.", keywords = "DNA, Hardware Accelerator, Local Sequence Alignment, Traceback.", isbn = "978-1-4244-6827-0", doi = "10.1109/HPCS.2010.5547154" } @InProceedings{Lazzari-ISCAS10, author = "Cristiano Lazzari and Paulo Flores and José Monteiro and Luigi Carro", title = "Voltage-mode Quaternary {FPGA}s: An Evaluation of Interconnections", booktitle = ISCAS, pages = " 869--972 ", location = "Paris, France", month = MAY # " 30--" # JUN # " 2, ", year = 2010, abstract = "This work presents a study about FPGA interconnections and evaluates their effects on voltage-mode binary and quaternary FPGA structures. FPGAs are widely used due to the fast time-to-market and reduced non-recurring engineering costs in comparison to ASIC designs. Interconnections play a crucial role in modern FPGAs, because they dominate delay, power and area. The use of multiple-valued logic allows the reduction of the number of signals in the circuit, hence providing a mean to effectively curtail the impact of interconnections. The most important characteristic of the results are the reduced fanout, fewer number of wires and the smaller wire length presented by the quaternary devices. We use a set of arithmetic circuits to compare binary and quaternary implementations. This work presents the first step on developing quaternary circuits by mapping any binary random logic onto quaternary devices.", keywords = "FPGAs, Mutiple-valued Logic, Low-power Quaternary Voltage-mode Device, Interconnectios, Arithmetic Circuits.", isbn = "978-1-4244-5308-5", doi = "10.1109/ISCAS.2010.5537423" } @InProceedings{Lazzari-DATE10, author = "Cristiano Lazzari and Paulo Flores and José Monteiro and Luigi Carro", title = "A New Quaternary {FPGA} Based on a Voltage-mode Multi-valued Circuit", booktitle = DATE, pages = " 1797--1802 ", location = "Dresden, Germany", month = MAR # " 8--12, ", year = 2010, abstract = "FPGA structures are widely used due to early time-to-market and reduced non-recurring engineering costs in comparison to ASIC designs. Interconnections play a crucial role in modern FPGAs, because they dominate delay, power and area. Multiple-valued logic allows the reduction of the number of signals in the circuit, hence can serve as a mean to effectively curtail the impact of interconnections. In this work we propose a new FPGA structure based on a low-power quaternary voltage-mode device. The most important characteristics of the proposed architecture are the reduced fanout, low number of wires and switches, and the small wire length. We use a set of FIR filters as a demonstrator of the benefits of the quaternary representation in FPGAs. Results show a significant reduction on power consumption.", keywords = "FPGAs, Mutiple-valued Logic, Low-power Quaternary Voltage-mode Device, FIR filters.", issn = "1530-1591", isbn = "978-1-4244-7054-9", doi = "10.1109/DATE.2010.5457105" } @InProceedings{Lazzari-SCS09, author = "Cristiano Lazzari and Paulo Flores and José Carlos Monteiro", title = "Power and Delay Comparison of Binary and Quaternary Arithmetic Circuits", booktitle = SCS, pages = " 1--6", location = "Jerba, Tunisia", month = NOV # " 6--8, ", year = 2009, abstract = "Interconnections play a crucial role in deep sub-micron designs because they dominate the delay, power and area. This is especially critical for modern million-gates FPGAs, where as much as 90\% of chip area is devoted to interconnections. Multiple-valued logic allows for the reduction of the required number of signals in the circuit, hence can serve as a means to effectively curtail the impact of interconnections. We present in this paper a comparison of binary and quaternary implementations of arithmetic modules based on lookup table structures using a voltage-mode circuits. Our assessment demonstrates that significant power reduction is possible through the use of quaternary structures, with very low delay penalties.", keywords = "Quaternary Logic, Arithmetic Circuits, FPGA Synthesis, Delay and Power Consumption.", isbn = "978-1-4244-4397-0", doi = "10.1109/ICSCS.2009.5412586" } @InProceedings{Aksoy-ICC09, author = "Levent Aksoy and Ece Olcay Gunes and Paulo Flores", title = "Optimization of Area under a Delay Constraint in Multiple Constant Multiplications", booktitle = ICC, publisher = "World Scientific and Engineering Academy and Society (WSEAS)", pages = " 81--86", location = "Rodos, Greece", month = JUL # " 22--24, ", year = 2009, abstract = "The Multiple Constant Multiplications (MCM), i.e., the multiplication of a variable by a set of constants, has been a central operation and performance bottleneck in many digital signal processing applications such as, image and video processing, digital television, and wireless communications. Since the design of multiplications is expensive in terms of area, delay, and power consumption in hardware, the area-delay optimization of the MCM operation has often been accomplished by using the shift-adds architecture. However, most of the previously proposed algorithms have focused on the optimization of area ignoring the crucial tradeoff between area and delay of the computation. In this paper, we introduce an approximate algorithm that can find near optimal area solutions under the user specified delay constraint. It is shown by the experimental results that the proposed algorithm finds better area-delay solutions than the previously proposed efficient algorithms.", keywords = "Multiple constant multiplications, area and delay optimization, high-level synthesis, digital FIR filters.", issn = "790-5117", isbn = "978-960-474-096-3" } @InProceedings{Aksoy-NORCHIP08, author = "Levent Aksoy and Ece Olcay Gunes and Paulo Flores", title = "An Exact Breadth-First Search Algorithm for the Multiple Constant Multiplications Problem", booktitle = NORCHIP, pages = " 41--46 ", location = "Tallinn, Estonia", month = NOV # " 16--17, ", year = 2008, abstract = "This paper addresses the multiplication of one data sample with multiple constants using addition/subtraction and shift operations, i.e., the multiple constant multiplications (MCM) problem. The MCM problem finds itself and its variants in many applications, such as digital finite impulse response (FIR) filters, linear signal transforms, and computer arithmetic. Although many efficient algorithms have been proposed to implement the MCM using the fewest number of operations, they have been heuristics, i.e., they cannot guarantee the minimum solution. In this work, we propose an exact algorithm based on the exhaustive search that finds the minimum number of operations solution of an MCM problem. We have tested the exact algorithm on a set of instances including FIR filter and randomly generated instances, and compared with the previously proposed efficient heuristics. It is observed from the experimental results that, even though the exact algorithm can be applied only on less complex instances, it finds better solutions than the prominent heuristics when applied on real size instances", keywords = "Multiple constant multiplications problem, exact algorithm, breadth-first exhaustive search.", isbn = "978-1-4244-2492-4", doi = "10.1109/NORCHP.2008.4738280", annote = "Submitted ICCAD 2008-04-14, Rejected 2008-06-27" } @InProceedings{Daitx-SCS08, author = "Fábio Daitx and Vagner Rosa and Eduardo Costa and Paulo Flores and Sérgio Bampi", title = "{VHDL} Generation of Optimized {FIR} Filters", booktitle = SCS, pages = " 1--5", location = "Hammamet, Tunisia", month = NOV # " 7--9, , ", year = 2008, abstract = "This work proposes an VHDL generation software for optimized FIR filters. In this paper a near optimum algorithm for constant coefficient FIR filters was used. This algorithm uses general coefficient representation for the optimal sharing of partial products in Multiple Constants Multiplications (MCM). The developed tool was compared to Matlab FDA toolbox. Synthesis results show that our tool is able to produce significantly better hardware than FDA toolbox, doubling the speed and reducing the silicon area by 75\%. The software produces a generic VHDL output, synthesizable to ASIC or FPGA.", keywords = "FIR filters; MCM optimization; FPGA implementation.", isbn = "978-1-4244-2627-0", doi = "10.1109/ICSCS.2008.4746944", url = "http://www.emc-lab.net/Conferences/SCS2008/index.php", annote = "Rejected-ISCAS08+VLSISoC07, Submitted-CSC08 on 2008-08-10" } @InProceedings{Morgado-PATMOS08, author = "P. Marques Morgado and Paulo F. Flores and Jos{\'e} C. Monteiro and L. Miguel Silveira", title = "Generating Worst-Case Stimuli for Accurate Power Grid Analysis", booktitle = PATMOS, editor = "Lars Svebsson and José Monteiro", publisher = SPRINGER, pages = " 247--257 ", location = "Lisbon, Portugal", month = SEP # " 12--14, ", year = 2008, abstract = "Power distribution systems provide the voltages and currents that devices in a circuit need to operate properly and silicon success requires its careful design and verification. However, problems like voltage drop, ground bounce and electromigration which may cause chip failures, are worsening, as more devices, operating at higher frequencies, are placed closer together. Verification of this type of systems is usually done by simulation, a costly endeavor given the size of current grids, making the determination of the worst-case input setting a crucial task. Current methodologies are based on supposedly safe settings targeting either unrealistic simultaneous switching on all signals or heuristic accounts of the joint switching probability of nearby signals. In this paper we propose a methodology for computation of the worst-case stimuli for power grid analysis. This is accomplished by determining the input vector that maximizes the number of gates in close proximity to each other that can switch in a given time window. The addition of these temporal and spatial restrictions makes the solution of the underlying optimization problem feasible. Comparisons with existing alternatives show that only a fraction of the gates change in any given timing window, leading to a more robust and efficient verification methodology.", keywords = "Power Grid Analysis; Wort-case Stimuli.", issn = "0302-9743", isbn = "978-3-540-95947-2" } @InProceedings{Sebastiao-DSD08, author = "Nuno Sebasti{\~a}o and Tiago Dias and Nuno Roma and Paulo Flores and Leonel Sousa", title = "Application Specific Programmable {IP} Core for Motion Estimation: Technology Comparison Targeting Efficient Embedded Co-Processing Units", booktitle = DSD, pages = " 181--188 ", location = "Parma, Italy", month = SEP # " 3--5, ", year = 2008, abstract = "The implementation of a recently proposed IP core of an efficient motion estimation co-processor is considered. Some significant functional improvements to the base architecture are proposed, as well as the presentation of a detailed description of the interfacing between the coprocessor and the main processing unit of the video encoding system. Then, a performance analysis of two distinct implementations of this IP core is presented, considering two different target technologies: a high performance FPGA device, from the Xilinx Virtex-II Pro family, and an ASIC based implementation, using a 0.18um CMOS StdCell library. Experimental results have shown that the two alternative implementations have quite similar performance levels and allow the estimation of motion vectors in real-time", keywords = "Motion Estimation IP core; Technology Comparison; Embedded Co-Processing.", isbn = "978-0-7695-3277-6", doi = "10.1109/DSD.2008.66" } @InProceedings{Sebastiao-ACACES08, author = "Nuno Sebasti{\~a}o and Tiago Dias and Nuno Roma and Paulo Flores and Leonel Sousa", title = "Specialized Motion Estimation Processor for Heterogeneous Multicore Video Coding Systems", booktitle = "International Summer School on Advanced Computer Architecture and Compilation for Embedded Systems (ACACES) Poster Abstracts", pages = " 63--66 ", location = "L'Aquila, Italy", month = JUL # " 13--19, ", year = 2008, abstract = "In the last few years, several highly efficient processor architectures (e.g.: ARM, Power-PC, etc.) have been proposed to implement computational demanding multimedia applications. This paper proposes to use these capabilities to extend the main processor computational resources, in order to efficiently execute one of the most critical parts of the video coding algorithm: motion estimation. This extension is accomplished by embedding a specialized Application Specific Instruction Set Processor (ASIP) core for motion estimation that works together with a General Purpose Processor in a heterogeneous multicore environment.", keywords = "Motion Estimation; Multicore; Video Encoding.", isbn = "978-9-038-21288-3", annote = "Summer School: short paper + poster" } @InProceedings{Aksoy-SIPS07, author = "Levent Aksoy and Ece Olcay Gunes and Eduardo Costa and Paulo Flores and José Monteiro", title = "Effect of Number Representation on the Achievable Minimum Number of Operations in Multiple Constant Multiplications", booktitle = SIPS, pages = " 424--429 ", location = "Shangai, China", month = OCT # " 17--19, ", year = 2007, abstract = "In this work, we analyze the effect of representing constants under binary, CSD, and MSD representations on the minimum number of operations required in a multiple constant multiplications problem. To this end, we resort to a recently proposed algorithm that computes the exact minimum solution. To extend the applicability of this algorithm to much larger instances, we propose problem reduction and model simplification techniques that significantly reduce the search space. We have conducted experiments on a rich set of instances including randomly generated and FIR filter instances. The results show that, contrary to common belief, the binary representation clearly yields better solutions than CSD, and even provides slightly better solutions than MSD. Moreover, the superiority of the binary solutions increases as the number and bit-width of the constants increase.", keywords = "Canonical Signed Digit (CSD), Common Subexpression Elimination (CSE), Minimal Signed Digit (MSD), Multiple Constant Multiplication (MCM).", issn = "1520-6130", isbn = "978-1-4244-1222-8", doi = "10.1109/SIPS.2007.4387585", annote = "IF=, Citations(Scholar)=1, Citations(ISI)=, " } @InProceedings{Aksoy-ECCTD07, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and Jos{\'e} Monteiro", title = "Minimum Number of Operations under a General Number Representation for Digital Filter Synthesis", booktitle = ECCTD, pages = " 252--255 ", location = "Seville, Spain", month = AUG # " 26--30, ", year = 2007, abstract = "In this work, we introduce an algorithm for the optimization of the number of operations in the multiplier block of a digital filter based on a general number representation for the coefficients. In common subexpression elimination algorithms, constants are generally represented with the minimum number of non-zero digits based on their CSD, or MSD representations. We observe that these representations may yield a solution far from the minimum. The general number representation used in our algorithm considers a much larger set of alternative implementations of a constant, which includes the CSD and MSD representations. To cope with the increased search space, we propose model simplification and problem reduction techniques. In this paper, we show that the proposed exact algorithm using general number representation achieves a significant reduction in the number of operations, which can be up to 15\% with respect to the solutions obtained under MSD representation.", isbn = "1-4244-1342-7", doi = "10.1109/ECCTD.2007.4529584", annote = "IF=, Citations(Scholar)=3, Citations(ISI)=, " } @InProceedings{Aksoy-DAC07, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and Jos{\'e} Monteiro", title = "Optimization of Area in Digital {FIR} Filters using Gate-Level Metrics", booktitle = DAC, publisher = "ACM Press", address = "New York, NY, USA", pages = " 420--423 ", location = "San Diego, California", month = JUN # " 4--8, ", year = 2007, abstract = "In the paper, we propose a new metric for the minimization of area in the generic problem of multiple constant multiplications, and demonstrate its effectiveness for digital FIR filters. Previous methods use the number of required additions or subtractions as a cost function. We make the observation that not all of these operations have the same design cost. In the proposed algorithm, a minimum area solution is obtained by considering area estimates for each operation. To this end, we introduce accurate hardware models for addition and subtraction operations in terms of gate-level metrics, under both signed and unsigned representations. Our algorithm not only computes the best design solution among those that have the same number of operations, but is also able to find better area solutions using a non-minimum number of operations. The results obtained by the proposed exact algorithm are compared with the results of the exact algorithm designed for the minimum number of operations on FIR filters and it is shown that the area of the design can be reduced by up to 18\%. General Terms Algorithms, design.", keywords = "FIR, Multiple Constant Multiplication, Area Optimization.", issn = "0738-100X", isbn = "978-1-59593-627-1", doi = "http://doi.acm.org/10.1145/1278480.1278588", annote = "IF=, Citations(Scholar)=4, Citations(ISI)=, " } @InProceedings{Morgado-ISVLSI07, author = "P. Marques Morgado and Paulo F. Flores and L. Miguel Silveira", title = "Generating Realistic Stimuli for Accurate Power Grid Analysis", booktitle = ISVLSI, pages = " 233--238 ", location = "Porto Alegre", month = MAR # " 9--11, ", year = 2007, abstract = "Power distribution systems in integrated circuits are used to provide the voltages and currents the devices need to operate properly. As the semiconductor industry moves into deep nanometer nodes, problems like voltage drop, ground bounce and electromigration which may cause chip failures, are worsening, as more devices, operating at higher frequencies are placed closer together. Verification of a power distribution system is therefore paramount to silicon success. This type of verification is usually done by simulation, targeting a worst-case scenario, typically characterized by the almost simultaneous switching of several devices in the circuit. The definition of the worst-case situation is therefore crucial, since it influences the result of the simulation and ultimately the design target. Supposedly safe but unrealistic settings such as assuming that all signals switch simultaneously, could lead to costly over-designs in terms of die area. In this paper we describe a software tool that generates a reasonable, realistic, worst-case set of stimuli for simulation, based on timing and spatial restrictions that arise from the circuit's netlist and placement. Generating such stimuli is akin to performing a standard static timing analysis, as is done before signoff so the tool fits well within conventional design frameworks. The resulting stimuli indicates that only a fraction of the gates change in any given timing window, leading to a more robust verification methodology, specially in the dynamic case.", keywords = "Circuit CAD, Formal Verification, Integrated Circuit Design, Power Distribution, Power Grids, Software Tools, Timing.", isbn = "0-7695-2896-1", doi = "10.1109/ISVLSI.2007.47", annote = "IF=, Citations(Scholar)=, Citations(ISI)=, " } @InProceedings{Aksoy-ICECS06, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and Jos{\'e} Monteiro", title = "{ASSUME}s: Heuristic Algorithms for Optimization of Area and Delay in Digital Filter Synthesis", booktitle = ICECS, pages = " 748--751 ", month = DEC # " 10--13, ", year = 2006, abstract = "In this work two heuristic algorithms are presented for the problems of optimization of area and optimization of area under a delay constraint in digital filter synthesis. The heuristics search for a solution on a combinational network that represents a covering problem using a greedy method for partial term selection. The methods start from the outputs towards the inputs for each coefficient. This top-down approach considers a much larger solution space than existing bottom-up heuristic algorithms. We present results on a wide range of instances and compare them with exact and prominent heuristic algorithms. The results demonstrate that the solutions obtained by the proposed heuristics are extremely close to the exact solutions and are significantly better than the existing heuristic algorithms.", isbn = "1-4244-0395-2", doi = "10.1109/ICECS.2006.379897", annote = "IF=, Citations(Scholar)=1, Citations(ISI)=, " } @InProceedings{Costa-SBCCI06, author = "Eduardo Costa and Paulo Flores and Jos{\'e} Monteiro", title = "Exploiting General Coefficient Representation for the Optimal Sharing of Partial Products in {MCM}s", booktitle = SBCCI, publisher = "ACM Press", address = "New York, NY, USA", pages = " 161--166 ", location = "Ouro Preto, MG, Brazil", month = AUG # " 28--31, ", year = 2006, abstract = "We propose a new algorithm that maximizes he sharing of partial terms in Multiple Cons an Multiplication (MCM) operations under a general number representation for the coefficients. MCM operations are required by many algorithms in digital signal processing and have been the subject of extensive research. By making no assumptions as to the number representation, the algorithm described in his paper is able to perform a better search for the optimal sharing of partial terms than previous methods based on MSD or CSD representations. We have applied our algorithm for the hardware minimization of FIR filers.The results show that we can ob ain solutions that require between 20\% to 50\% less hardware when compared agains he solutions using he MSD representation.", keywords = "Common Subexpression Elimination (CSE), Digital Filter Design, Minimal Signed Digit (MSD), Multiple Constant Multiplication (MCM).", isbn = "1-59593-479-0", doi = "http://doi.acm.org/10.1145/1150343.1150387", annote = "IF=, Citations(Scholar)=1, Citations(ISI)=, " } @InProceedings{Aksoy-DAC06, author = "Levent Aksoy and Eduardo Costa and Paulo Flores and Jos{\'e} Monteiro", title = "Optimization of Area under a Delay Constraint in Digital Filter Synthesis Using {SAT}-Based Integer Linear Programming", booktitle = DAC, publisher = "ACM Press", address = "New York, NY, USA", pages = " 669--674 ", location = "San Francisco, CA, USA", month = JUL # " 10--14, , ", year = 2006, abstract = "In this paper, we propose an exact algorithm for the problem of area optimization under a delay constraint in the synthesis of multiplierless FIR filters. To the best of our knowledge, the method presented in this paper is the only exact algorithm designed for this problem. We present the results of the algorithm on real-sized filter instances and compare with an improved version of a recently proposed exact algorithm designed for the minimization of area. We show that in many cases delay can be minimized without any area penalty. Additionally, we describe two approximate algorithms that can be applied to instances which cannot be solved, or take too long, with the exact algorithm. We show that these algorithms find similar solutions to the exact algorithm in less CPU time.", issn = "0738-100X", isbn = "1-59593-381-6", doi = "http://doi.acm.org/10.1145/1146909.1147079", annote = "IF=, Citations(Scholar)=2, Citations(ISI)=, " } @InProceedings{Flores-ICCAD05, author = "Paulo Flores and Jos{\'e} Monteiro and Eduardo Costa", title = "An Exact Algorithm for the Maximal Sharing of Partial Terms in Multiple Constant Multiplication", booktitle = ICCAD, publisher = "IEEE Computer Society", address = "Washington, DC, USA", pages = " 13--16 ", location = "San Jose, CA", month = NOV # " 6--10, ", year = 2005, abstract = "In this paper, we propose an exact algorithm that maximizes the sharing of partial terms in multiple constant multiplication (MCM) operations. We model this problem as a Boolean network that covers all possible partial terms which may be used to generate the set of coefficients in the MCM instance. The PIs to this network are shifted versions of the MCM input. An AND gate represents an adder or a subtracter, i.e., an AND gate generates a new partial term. All partial terms that have the same numerical value are ORed together. There is a single output which is an /spl and/ over all the coefficients in the MCM. We cast this problem into a 0-1 integer linear programming (ILP) problem by requiring that the output is asserted while minimizing the total number of AND gates that evaluate to one. A SAT-based solver is used to obtain the exact solution. We argue that for many real problems the size of the problem is within the capabilities of current SAT solvers. We present results using binary, CSD and MSD representations. Two main conclusions can be drawn from the results. One is that, in many cases, existing heuristics perform well, computing the best solution, or one close to it. The other is that the flexibility of the MSD representation does not have a significant impact in the solution obtained.", isbn = "0-7803-9254-X", doi = "10.1109/ICCAD.2005.1560032", annote = "IF=, Citations(Scholar)=20, Citations(ISI)=, " } @InProceedings{Costa-ECCTD05, author = "Eduardo da Costa and Paulo Flores and Jos{\'e} Monteiro", title = "Maximal Sharing of Partial Terms in {MCM} Under Minimal Signed Digit Representation", booktitle = ECCTD, volume = "II", pages = " 221--224 ", month = AUG # " 28--" # SEP # " 2, ", year = 2005, abstract = "We propose a new algorithm that maximizes the sharing of partial terms in Multiple Constant Multiplication (MCM) operations. MCM operations are required by many algorithms in digital signal processing and have been the subject of extensive research. Recently, the Minimal Signed Digit (MSD) number representation has been proposed as an extension to the Canonical Signed Digit (CSD) representation. By properly exploiting the redundancy of the MSD representation, the hardware implementation can be significantly optimized. The initial algorithm described in this paper is able to perform a better search for the optimal sharing of the redundant coefficient representations under MSD than previous methods. However, during its search the depth of adder-steps is not considered. We present a modified version of this algorithm that is able to reduce the maximum depth of partial terms at the expense of some extra hardware. The results show that for more complex problems our algorithm performs significantly better than previous approaches, in some cases obtaining solutions that require 25\% less hardware.", keywords = "Adders, Digital Arithmetic, Digital Signals, Multiplying Circuits.", isbn = "0-7803-9066-0", doi = "10.1109/ECCTD.2005.1523033", annote = "IF=, Citations(Scholar)=4, Citations(ISI)=, " } @InProceedings{Flores-ISCAS99, author = "Paulo Flores and Hor{\'a}cio Neto and Krishnendu Chakrabarty and Jo{\~a}o Marques-Silva", title = "Test pattern generation for width compression in {BIST}", booktitle = ISCAS, volume = 1, pages = " 114--118 ", location = "Orlando, FL, USA", month = MAY # " 30 -- " # JUN # " 2, ", year = 1999, abstract = "The main objectives of built-in self test (BIST) are the design of test pattern generator circuits which achieve the highest fault coverage, require the shortest sequence of test vectors and utilize the minimum circuit area. This paper targets the problem of generating test patterns for stuck-at faults that induce compatibility relations between the primary inputs of the circuit under test. These compatibility relations can be used for designing counter-based test generator circuits with a reduced number of bits, thus requiring smaller testing time and smaller area. The proposed solution is based on an integer linear programming (ILP) formulation that builds on existing propositional satisfiability (SAT) models for test pattern generation. An ATPG tool for minimum test pattern generation for width compression (MTP-C) is described, which illustrates the practical applicability of our approach for a wide range of benchmark circuits", keywords = "VLSI, Automatic Test Pattern Generation, Built-In Self Test, Fault Diagnosis, Integer Programming, Linear Programming, Logic Testing; BIST, Counter-Based Test Generator Circuits, Fault Coverage, Integer Linear Programming, Minimum Circuit Area, Minimum Test Pattern Generation, Propositional Satisfiability Models, Width Compression.", isbn = "0-7803-5471-0", doi = "10.1109/ISCAS.1999.777818" } @InProceedings{Flores-GLS99, author = "Paulo Flores and Hor{\'a}cio Neto and Jo{\~a}o Marques-Silva", title = "On Applying Set Covering Models to Test Set Compaction", booktitle = GLS, publisher = "IEEE Computer Society", pages = " 8--11 ", location = "Ypsilanti, MI", month = MAR # " 4--6, ", year = 1999, abstract = "Test set compaction is fundamental problem in digital system testing. In recent years, many competitive solutions have been proposed, most of which based on heuristics approaches. This paper studies the application of set covering models to the compaction of test sets, which can be used with any heuristic test set compaction procedure. For this purpose, recent and highly effective set covering algorithms are used. Experimental evidence suggests that the size of computed test sets can often be reduced by using set covering models and algorithms. Moreover a noteworthy empirical conclusion is that it may be preferable not to use fault simulation when the final objective is test set compaction", keywords = "Automatic Test Pattern Generation, Integrated Circuit Testing, Logic Testing, Minimisation.", doi = "10.1109/GLSV.1999.757365" } @InProceedings{Flores-VLSID99, author = "Paulo Flores and Jos{\'e} Costa and Hor{\'a}cio Neto and Jos{\'e} Monteiro and Jo{\~a}o Marques-Silva", title = "Assignment and Reordering of Incompletely Specified Pattern Sequences Targetting Minimum Power Dissipation", booktitle = "VLSID '99: Proceedings of the 12th International Conference on VLSI Design - 'VLSI for the Information Appliance'", publisher = "IEEE Computer Society", address = "Washington, DC, USA", pages = " 37--41 ", location = "Goa, India", month = JAN # " 10--13, ", year = 1999, isbn = "0-7695-0013-7" } @InProceedings{Flores-ICCD98, author = "Paulo F. Flores and Hor{\'a}cio C. Neto and Jo{\~a}o P. Marques-Silva", title = "An exact solution to the minimum size test pattern problem", booktitle = ICCD, pages = " 510--515 ", location = "Austin, TX", month = OCT # " 5--7, ", year = 1998, abstract = "This paper addresses the problem of test pattern generation for single stuck-at faults in combinational circuits, under the additional constraint that the number of specified primary input assignments is minimized. This problem has different applications in testing including the identification of don't care conditions to be used in the synthesis of Built-In Self- Test (BIST) logic. The proposed solution is based on an integer linear programming (ILP) formulation which builds on an existing propositional satisfiability (SAT) model for test pattern generation. The resulting ILP formulation is linear on the size of the original SAT model for test generation, which is linear on the size of the circuit. Nevertheless, the resulting ILP instances represent complex optimization problems, that require dedicated ILP algorithms. Preliminary results on benchmark circuits validate the practical applicability of the test pattern minimization model and associated ILP algorithm", keywords = "Automatic Test Pattern Generation, Built-In Self Test, Combinational Circuits, Computability, Integer Programming, Linear Programming, Logic Testing.", doi = "10.1109/ICCD.1998.727097" } @InProceedings{Flores-IWLS98-power, author = "Jos{\'e} C. Costa and Paulo F. Flores and Hor{\'a}cio C. Neto and Jos{\'e} C. Monteiro and Jo{\~a}o P. Marques-Silva", title = "Power Reduction in {BIST} by Exploiting Don't Cares in Test Patterns", booktitle = IWLS, pages = " 375--386 ", month = JUN, year = 1998 } @InProceedings{Flores-IWLS98-mtp, author = "Paulo F. Flores and Hor{\'a}cio C. Neto and Jo{\~a}o P. Marques-Silva", title = "An Exact Solution to the Minimum-Size Test Pattern Problem", booktitle = IWLS, pages = " 452--470 ", month = JUN, year = 1998 } @InProceedings{Flores-ETW98-power, author = "Jos{\'e} C. Costa and Paulo F. Flores and Hor{\'a}cio C. Neto and Jos{\'e} C. Monteiro and Jo{\~a}o P. Marques-Silva", title = "Exploiting Don't Cares in Test Patterns to Reduce Power During {BIST}", booktitle = ETW, pages = " 34--36 ", month = MAY, year = 1998 } @InProceedings{Flores-ETW98-mtp, author = "Paulo F. Flores and Hor{\'a}cio C. Neto and Krishnendu Chakrabarty and Jo{\~a}o P. Marques-Silva", title = "A Model and Algorithm for Computing Minimum-Size Test Patterns", booktitle = ETW, pages = " 147--148 ", month = MAY, year = 1998 } @InProceedings{Flores-CTC98-power, author = "Jos{\'e} C. Costa and Paulo F. Flores and Jos{\'e} C. Monteiro and Jo{\~a}o P. Marques-Silva", title = "Power Reduction in {BIST} by Exploiting Don't Cares in Test Patterns", booktitle = CTC, month = MAY, year = 1998 } @InProceedings{Flores-CTC98-mtp, author = "Paulo F. Flores and Hor{\'a}cio C. Neto and Krishnendu Chakrabarty and Jo{\~a}o P. Marques-Silva", title = "A Model and Algorithm for Computing Minimum-Size Test Patterns", booktitle = CTC, month = MAY, year = 1998 } @InProceedings{Manquinho-ICTAI97, author = "Vasco Manquinho and Paulo Flores and Jo{\~a}o Marques-Silva and Arlindo L. Oliveira", title = "Prime Implicant Computation Using Satisfiability Algorithms", booktitle = ICTAI, pages = " 232--239 ", location = "Newport Beach, CA", month = NOV # " 3--8, ", year = 1997, abstract = "The computation of prime implicants has several and significant applications in different areas, including automated reasoning, non-monotonic reasoning, electronic design automation, among others. The authors describe a new model and algorithm for computing minimum-size prime implicants of propositional formulas. The proposed approach is based on creating an integer linear program (ILP) formulation for computing the minimum-size prime implicant, which simplifies existing formulations. In addition, they introduce two new algorithms for solving ILPs, both of which are built on top of an algorithm for propositional satisfiability (SAT). Given the organization of the proposed SAT algorithm, the resulting ILP procedures implement powerful search pruning techniques, including a non-chronological backtracking search strategy, clause recording procedures and identification of necessary assignments. Experimental results, obtained on several benchmark examples, indicate that the proposed model and algorithms are significantly more efficient than other existing solutions", keywords = "Backtracking, Computability, Integer Programming, Linear Programming, Nonmonotonic Reasoning, Search Problems.", doi = "10.1109/TAI.1997.632261" } @InProceedings{Flores-DCIS96, author = "Paulo Flores and Hor{\'a}cio Neto", title = "Register Transfer Level {VHDL} Block Generation", booktitle = DCIS, month = NOV, year = 1996 }