%% Copyright (C) by Paulo Flores.
%%
%% Time-stamp: 2015-01-27 16:59:37    team.bib    pff(at)inesc-id.pt
%%
%% Desc: articles e InProceedings para oprojecto FCT: apporx
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% to generate a list of all bibliografy
% printbibkey /home/pff/lib/tex/bib/pff-author.bib
% mv test.bbl xx ; sed -e "s/\\LARGE/\\clearpage \\LARGE/g" xx> test.bbl

% to generate a hmtl file  see bbl2html.pl

% to check dup references
% grep eta ~/lib/tex/bib/pff.bib | cut -d\{ -f 2 |tr [A-Z] [a-z]|sort|uniq -c
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@String{PFF-AUTHOR = "Paulo Flores" }

@Article{Aksoy-TSP15-RefA,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "Exact and Approximate Algorithms for the Filter Design
                  Optimization Problem",
  journal =      TSP,
  volume =       63,
  number =       1,
  pages =        " 142--154 ",
  month =        JAN,
  year =         2015,
  abstract =     "The filter design optimization (FDO) problem is For Revi
                  defined as finding a set of filter coefficients which
                  yields a filter design with minimum complexity, satisfying
                  the filter constraints. It has received a tremendous
                  interest due to the widespread application of filters.
                  Assuming that the coefficient multiplications in the filter
                  design are realized under a shift-adds architecture, the
                  complexity is generally defined in terms of the total
                  number of adders and subtractors. In this article, we
                  present an exact FDO algorithm that can guarantee the
                  minimum design complexity under the minimum quantization
                  value, but can only be applied to filters with a small
                  number of coefficients. We also introduce an approximate
                  algorithm that can handle filters with a large number of
                  coefficients using less computational resources than the
                  exact FDO algorithm and find better solutions than existing
                  FDO heuristics. We describe how these algorithms can be
                  modified to handle a delay constraint in the shift-adds
                  designs of the multiplier blocks and to target different
                  filter constraints and filter forms. Experimental results
                  show the effectiveness of the proposed algorithms with
                  respect to prominent FDO algorithms and explore the impact
                  of design parameters, such as the filter length,
                  quantization value, and filter form, on the complexity and
                  performance of filter designs.",
  keywords =     "Finite Impulse Response (FIR) filters; Direct and trans
                  posed forms; Filter design optimization problem;
                  Multiplierless design; Depth-first and local search
                  methods; Delay reduction; Filter design and structures; DSP
                  Algorithm implementation in hardware and software.",
  issn =         "1053-587X",
  doi =          "10.1109/TSP.2014.2366713"
}

@Article{Aksoy-TODAES14-RefB,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "Multiplierless Design of Folded {DSP} Blocks",
  journal =      TODAES,
  volume =       20,
  number =       1,
  pages =        " 14:1--14:24 ",
  month =        NOV,
  year =         2014,
  abstract =     "This article addresses the problem of optimizing the
                  implementation cost of the time-multiplexed constant
                  multiplication (TMCM) operation which realizes the
                  multiplication of an input variable by a single constant
                  selected from a set of multiple constants at a time. It
                  presents an efficient algorithm, called ORPHEUS, which
                  finds a multiplierless TMCM design by sharing logic
                  operators, i.e., adders, subtractors, adders/subtractors,
                  and multiplexors (MUXes). Moreover, this article introduces
                  folded design architectures for the digital signal
                  processing (DSP) blocks, such as finite impulse response
                  (FIR) filters and linear DSP transforms, and describes how
                  these folded DSP blocks can be efficiently realized using
                  TMCM operations optimized by ORPHEUS. Experimental results
                  indicate that ORPHEUS can find better solutions than
                  existing TMCM algorithms, yielding TMCM designs requiring
                  less area. They also show that the folded architectures
                  lead to alternative designs with significantly less area,
                  but incurring an increase in latency and energy
                  consumption, compared to the parallel architecture.",
  keywords =     "Time-multiplexed constant multiplication; Folded
                  architecture, multiplierless design; Area optimization;
                  Finite Impulse Response (FIR) filter, Discrete Cosine
                  Transform (DCT); Integer Cosine Transform (ICT); Folded
                  architectures.",
  issn =         "1084-4309",
  doi =          "10.1145/2663343"
}

@Article{Neves-TVLSI14,
  author =       "Nuno Neves and Nuno Sebastião and David Matos and Pedro
                  Tomás and Paulo Flores and Nuno Roma",
  title =        "Multicore {SIMD} {ASIP} for Next-Generation Sequencing and
                  Alignment Biochip Platforms",
  journal =      TVLSI,
  pages =        " ??--?? ",
  month =        JUL # " 18 ",
  year =         2014,
  abstract =     "Targeting the development of new biochip platforms capable
                  of autonomously sequencing and aligning biological
                  sequences, a new multicore processing structure is proposed
                  in this manuscript. This multicore structure makes use of a
                  shared memory model and multiple instantiations of a novel
                  application-specific instruction-set processor (ASIP) to
                  simultaneously exploit both fine and coarse-grained
                  parallelism and to achieve high performance levels at
                  low-power consumption. The proposed ASIP is built by
                  extending the instruction set architecture of a
                  synthesizable processor, including both general and
                  special-purpose single-instruction multiple-data
                  instructions. This allows an efficient exploitation of
                  fine-grained parallelism on the alignment of biological
                  sequences, achieving over 30x speedup when compared with
                  sequential algorithmic implementations. The complete system
                  was prototyped on different field-programmable gate array
                  platforms and synthesized with a 90-nm CMOS process
                  technology. Experimental results demonstrate that the
                  multicore structure scales almost linearly with the number
                  of instantiated cores, achieving performances similar to a
                  quad-core Intel Core i7 3820 processor, while using 25x
                  less energy.",
  keywords =     "Application Specific Instruction-Set Architecture;
                  Single-Instruction Multiple-Data; Multi-Core Architecture,
                  Biological Sequences Alignment; Biochip Platforms.",
  issn =         "1063-8210",
  doi =          "10.1109/TVLSI.2014.2333757",
  note =         ONLINE*
}

@Article{Aksoy-JCSSP14,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "A Tutorial on Multiplierless Design of {FIR} Filters:
                  Algorithms and Architectures",
  journal =      JCSSP,
  volume =       33,
  number =       6,
  pages =        " 1689--1719 ",
  location =     "Springer US",
  month =        JUN,
  year =         2014,
  abstract =     "Finite impulse response (FIR) filtering is a ubiquitous
                  operation in digital signal processing systems and is
                  generally implemented in full custom circuits due to
                  high-speed and low-power design requirements. The
                  complexity of an FIR filter is dominated by the
                  multiplication of a large number of filter coefficients by
                  the filter input or its time-shifted versions. Over the
                  years, many high-level synthesis algorithms and filter
                  architectures have been introduced in order to design FIR
                  filters efficiently. This article reviews how constant
                  multiplications can be designed using shifts and
                  adders/subtractors that are maximally shared through a
                  high-level synthesis algorithm based on some optimization
                  criteria. It also presents different forms of FIR filters,
                  namely, direct, transposed, and hybrid and shows how
                  constant multiplications in each filter form can be
                  realized under a shift-adds architecture. More importantly,
                  it explores the impact of the multiplierless realization of
                  each filter form on area, delay, and power dissipation of
                  both custom (ASIC) and reconfigurable (FPGA) circuits by
                  carrying out experiments with different bitwidths of filter
                  input, design libraries, reconfigurable target devices, and
                  optimization criteria in high-level synthesis algorithms.",
  keywords =     "FIR filter;Direct, transposed, and hybrid forms;
                  Multiplierless design; High-level synthesis; Area and delay
                  optimization; Custom and reconfigurable circuits.",
  issn =         "0278-081X (Print) 1531-5878 (Online)",
  doi =          "10.1007/s00034-013-9727-8"
}

@Article{Brito-TVLSI14,
  author =       "Diogo Brito and Taimur Rabuske and Jorge Fernandes and
                  Paulo Flores and José Monteiro",
  title =        "Quaternary Logic Look-up Table in Standard {CMOS}",
  journal =      TVLSI,
  pages =        " ??--?? ",
  location =     "IEEE",
  month =        MAR # ", 19",
  year =         2014,
  abstract =     "Interconnections are increasingly the dominant contributor
                  to delay, area and energy consumption in CMOS digital
                  circuits. Multiple-valued logic can decrease the average
                  power required for level transitions and reduces the number
                  of required interconnections, hence also reducing the
                  impact of interconnections on overall energy consumption.
                  In this paper, we propose a quaternary lookup table (LUT)
                  structure, designed to replace or complement binary LUTs in
                  field programmable gate arrays. The circuit is compatible
                  with standard CMOS processes, with a single voltage supply
                  and employing only simple voltage-mode structures. A clock
                  boosting technique is used to optimize the switches
                  resistance and power consumption. The proposed
                  implementation overcomes several limitations found in
                  previous quaternary implementations published so far, such
                  as the need for special features in the CMOS process or
                  power-hungry current-mode cells. We present a full adder
                  prototype based on the designed LUT, fabricated in a
                  standard 130-nm CMOS technology, able to work at 100 MHz
                  while consuming 122 W. The experimental results demonstrate
                  the correct quaternary operation and confirm the power
                  efficiency of the proposed design.",
  keywords =     "Field-Programmable Gate Array (FPGA); Lookup Table (LUT);
                  Multiple-Valued Logic (MVL); Quaternary Logic; Standard
                  CMOS Technology.",
  issn =         "1063-8210",
  doi =          "10.1109/TVLSI.2014.2308302",
  note =         ONLINE*
}

@Article{Sebastiao-CCPE13,
  author =       "Nuno Sebastião and Nuno Roma and Paulo Flores",
  title =        "Configurable and scalable class of high performance
                  hardware accelerators for simultaneous {DNA} sequence
                  alignment",
  journal =      CCPE,
  volume =       25,
  number =       10,
  pages =        " 1319--1339 ",
  month =        JUL,
  year =         2013,
  abstract =     "A new class of efficient and flexible hardware accelerators
                  for DNA local sequence alignment based on the widely used
                  Smith-Waterman algorithm is proposed in this paper. This
                  new class of accelerating structures exploits an innovative
                  technique that tracks the origin coordinates of the best
                  alignment to allow a significant reduction of the size of
                  the dynamic programming matrix that needs to be recomputed
                  during the subsequent traceback phase, providing a
                  considerable reduction of the resulting time and memory
                  requirements. The significant performance of the enhanced
                  class of accelerators is attained by also providing support
                  for an additional level of parallelism: the capability to
                  concurrently align several query sequences with one or more
                  reference sequences, according to the specific application
                  requisites. Moreover, the accelerator class also includes
                  specially designed processing elements that improve the
                  resource usage when implemented in a Field Programmable
                  Gate Array (FPGA), and easily provide several different
                  configurations in an Application Specific Integrated
                  Circuit (ASIC) implementation. Obtained results
                  demonstrated that speedups as high as 278 can be obtained
                  in ASIC accelerating structures. A FPGA-based prototyping
                  platform, operating at a 40 times lower clock frequency and
                  incorporating a complete alignment embedded system, still
                  provides significant speedups as high as 27, compared with
                  a pure software implementation.",
  keywords =     "Hardware accelerator; DNA; Local Sequence Alignment;
                  Traceback; FPGA; ASIC.",
  issn =         "1532-0634",
  doi =          "10.1002/cpe.2934",
  note =         "Special Issue: High performance computing and simulation:
                  architectures, systems, algorithms, technologies, services,
                  and applications"
}

@Article{Aksoy-TVLSI13-RefC,
  author =       "Levent Aksoy and Cristiano Lazzari and Eduardo Costa and
                  Paulo Flores and José Monteiro",
  title =        "Design of Digit-Serial {FIR} Filters: Algorithms,
                  Architectures, and a {CAD} Tool",
  journal =      TVLSI,
  volume =       21,
  number =       3,
  pages =        " 498--511 ",
  month =        MAR,
  year =         2013,
  abstract =     "In the last two decades, many efficient algorithms and
                  architectures have been introduced for the design of
                  low-complexity bit-parallel multiple constant
                  multiplications (MCM) operation which dominates the
                  complexity of many digital signal processing systems. On
                  the other hand, little attention has been given to the
                  digit-serial MCM design that offers alternative
                  low-complexity MCM operations albeit at the cost of an
                  increased delay. In this paper, we address the problem of
                  optimizing the gate-level area in digit-serial MCM designs
                  and introduce high-level synthesis algorithms, design
                  architectures, and a computer-aided design tool.
                  Experimental results show the efficiency of the proposed
                  optimization algorithms and of the digit-serial MCM
                  architectures in the design of digit-serial MCM operations
                  and finite impulse response filters.",
  keywords =     "0-1 Integer Linear Programming (ILP); Digit-Serial
                  Arithmetic; Finite Impulse Response (FIR) Filters;
                  Gate-Level Area Optimization; Multiple Constant
                  Multiplications.",
  issn =         "1063-8210",
  doi =          "10.1109/TVLSI.2012.2188917",
  history =      "Accepted and re-submitted on 2011-11-17"
}

@Article{Sebastiao-TVLSI12,
  author =       "Nuno Sebastião and Nuno Roma and Paulo Flores",
  title =        "Integrated Hardware Architecture for Efficient Computation
                  of the n-Best Bio-Sequence Local Alignments in Embedded
                  Platforms",
  journal =      TVLSI,
  volume =       20,
  number =       7,
  pages =        " 1262 -- 1275 ",
  month =        JUL,
  year =         2012,
  abstract =     "A flexible hardware architecture that implements a set of
                  new and efficient techniques to significantly reduce the
                  computational requirements of the commonly used
                  Smith-Waterman sequence alignment algorithm is presented.
                  Such innovative techniques use information gathered by the
                  hardware accelerator during the computation of the
                  alignment scores to constrain the size of the subsequence
                  that has to be post-processed in the traceback phase using
                  a general purpose processor (GPP). Moreover, the proposed
                  structure is also capable of computing the n-best local
                  alignments according to the Waterman-Eggert algorithm,
                  becoming the first hardware architecture that is able to
                  simultaneously evaluate the n-best alignments of a given
                  sequence pair, by incorporating a set of ordering units
                  that work in parallel with the systolic array. A complete
                  alignment system was developed and implemented in a
                  Virtex-4 FPGA, by integrating the proposed accelerator
                  architecture with a Leon3 GPP. The obtained experimental
                  results demonstrate that the proposed systm is flexible and
                  allows the alignment of large sequences in memory
                  constrained systems. As an example, a speedup of 17 was
                  obtained with the conceived system when compared to a
                  regular implementation of the LALIGN35 program running on
                  an Intel Core2 Duo processor running at a 40 times higher
                  frequency.",
  keywords =     "DNA sequence alignment; Smith-Waterman (S-W);
                  Waterman-Eggert (W-E); Hardware accelerator.",
  issn =         "1063-8210",
  doi =          "10.1109/TVLSI.2011.2157541"
}

@Article{Aksoy-INTEGRATION12,
  author =       "Levent Aksoy and Cristiano Lazzari and Eduardo Costa and
                  Paulo Flores and José Monteiro",
  title =        "High-level algorithms for the optimization of gate-level
                  area in digit-serial multiple constant multiplications",
  journal =      INTEGRATION,
  volume =       45,
  number =       3,
  pages =        " 294--306 ",
  month =        JUN,
  year =         2012,
  abstract =     "The last two decades have seen tremendous effort on the
                  development of high-level synthesis algorithms for
                  efficient realization of the multiplication of a variable
                  by a set of constants using only addition, subtraction, and
                  shift operations. These algorithms generally target the
                  minimization of the number of adders and subtractors,
                  assuming that shifts are realized using only wires due to
                  the bit-parallel processing of the input data. On the other
                  hand, digit-serial architectures offer alternative
                  low-complexity designs since digit-serial operators occupy
                  less area and are independent of the data wordlength.
                  However, in this case, shifts are no longer free in terms
                  of hardware and require D flip-flops. Moreover, each
                  digit-serial addition, subtraction, and shift operation has
                  different implementation cost at gate-level. Hence, this
                  article introduces high-level algorithms that optimize the
                  area of digit-serial constant multiplications under the
                  shift-adds architecture by taking into account the
                  implementation cost of each operation at gate-level.
                  Experimental results indicate that our high-level
                  algorithms obtain better solutions than prominent
                  algorithms designed for the minimization of the number of
                  operations in terms of gate-level area and their solutions
                  lead to less complex digit-serial MCM designs. It is also
                  shown that the use of shift-adds architecture yields
                  significant area reductions when compared to the constant
                  multiplications designed using generic digit-serial
                  constant multipliers.",
  keywords =     "Multiple constant multiplications; Digit-serial
                  architectures; Gate-level area optimization; 0-1 Integer
                  linear programming; Graph-based algorithms.",
  issn =         "0167-9260",
  doi =          "10.1016/j.vlsi.2011.11.008",
  note =         "Special Issue of GLSVLSI 2011: Current Trends on VLSI and
                  Ultra Low-Power Design"
}

@Article{Sebastiao-MICPRO12,
  author =       "Nuno Sebastião and Nuno Roma and Paulo Flores",
  title =        "Hardware accelerator architecture for simultaneous
                  short-read {DNA} sequences alignment with enhanced
                  traceback phase",
  journal =      MICPRO,
  publisher =    "Elsevier Science Publishers B. V.",
  volume =       36,
  number =       2,
  pages =        " 96--109 ",
  month =        MAR,
  year =         2012,
  abstract =     "Dynamic programming algorithms are widely used to find the
                  optimal sequence alignment between any two DNA sequences.
                  This manuscript presents a new, flexible and scalable
                  hardware accelerator architecture to speedup the
                  implementation of the frequently used Smith-Waterman
                  algorithm. When integrated with a general purpose
                  processor, the developed accelerator significantly reduces
                  the computation time and memory space requirements of
                  alignment tasks. Such efficiency mainly comes from two
                  innovative techniques that are proposed. First, the usage
                  of the maximum score cell coordinates, gathered during the
                  computation of the alignment scores in the matrix-fill
                  phase, in order to significantly reduce the time and memory
                  requirements of the traceback phase. Second, the
                  exploitation of an additional level of parallelism in order
                  to simultaneously align several query sequences with the
                  same reference sequence, targeting the processing of
                  short-read DNA sequences. The results obtained from the
                  implementation of a complete alignment system based on the
                  new accelerator architecture in a Virtex-4 FPGA showed that
                  the proposed techniques are feasible and the developed
                  accelerator is able to provide speedups as high as 16 for
                  the considered test sequences. Moreover, it was also shown
                  that the proposed approach allows the processing of larger
                  DNA sequences in memory restricted environments.",
  keywords =     "DNA; Hardware accelerator; Local sequence alignment;
                  Traceback.",
  issn =         "0141-9331",
  doi =          "10.1016/j.micpro.2011.05.003",
  note =         "Special Issue - Exploitation Of Hardware Accelerators"
}

@Article{Aksoy-TODAES12,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "Optimization Algorithms for the Multiplierless Realization
                  of Linear Transforms",
  journal =      TODAES,
  volume =       17,
  number =       1,
  pages =        " 3:1 -- 3:27 ",
  month =        JAN,
  year =         2012,
  abstract =     "This article addresses the problem of finding the fewest
                  number of addition and subtraction operations in the
                  multiplication of a constant matrix with an input vector, a
                  fundamental operation in many linear digital signal
                  processing transforms. We first introduce an exact Common
                  Subexpression Elimination (CSE) algorithm that formalizes
                  the minimization of the number of operations as a 0-1
                  integer linear programming problem. Since there are still
                  instances that the proposed exact algorithm cannot handle
                  due to the NP-completeness of the problem, we also
                  introduce a CSE heuristic algorithm that iteratively finds
                  the most common 2-term subexpressions with the minimum
                  conflicts among the expressions. Furthermore, since the
                  main drawback of CSE algorithms is their dependency on a
                  particular number representation, we propose a hybrid
                  algorithm that initially finds promising realizations of
                  linear transforms using a numerical difference method, and
                  then applies the proposed CSE algorithm to utilize the
                  common subexpressions iteratively. The experimental results
                  on a comprehensive set of instances indicate that the
                  proposed approximate algorithms find competitive results
                  with those of the exact CSE algorithm and obtain better
                  solutions than prominent previously proposed heuristics. It
                  is also observed that our solutions yield significant area
                  reductions in the design of linear transforms after circuit
                  synthesis compared to direct realizations of linear
                  transforms.",
  keywords =     "0-1 Integer Linear Programming; Constant Matrix-Vector
                  Multiplication; Common Subexpression Elimination; Numerical
                  Difference Method.",
  issn =         "1084-4309",
  doi =          "10.1145/2071356.2071359"
}

@Article{Aksoy-MICPRO11,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "Finding the Optimal Tradeoff Between Area and Delay in
                  Multiple Constant Multiplications",
  journal =      MICPRO,
  volume =       35,
  number =       8,
  pages =        " 729--741 ",
  month =        NOV,
  year =         2011,
  abstract =     "Over the years many efficient algorithms for the
                  multiplierless design of multiple constant multiplications
                  (MCM) have been introduced. These algorithms primarily
                  focus on finding the fewest number of addition/subtraction
                  operations that generate the MCM. Although the complexity
                  of an MCM design is decreased by reducing the number of
                  operations, their solutions may not lead to an MCM design
                  with optimal area at gate-level since they do not consider
                  the implementation costs of the operations in hardware.
                  This article introduces two approximate algorithms that aim
                  to optimize the area of the MCM operation by taking into
                  account the gate-level implementation of each addition and
                  subtraction operation which realizes a constant
                  multiplication. To find the optimal tradeoff between area
                  and delay, the proposed algorithms are further extended to
                  find an MCM design with optimal area under a delay
                  constraint. Experimental results clearly indicate that the
                  solutions of the proposed algorithms lead to significantly
                  better MCM designs at gate-level when compared to those
                  obtained by the solutions of algorithms designed for the
                  optimization of the number of operations.",
  keywords =     "Multiple constant multiplications; Addition and subtraction
                  architectures; Gate-level area optimization; Delay aware
                  area optimization; 0-1 integer linear programming;
                  Graph-based algorithms.",
  issn =         "0141-9331",
  doi =          "10.1016/j.micpro.2011.08.009"
}

@Article{Lazzari-JOLPE11,
  author =       "Cristiano Lazzari and Jorge Fernandes and Paulo Flores and
                  José Monteiro",
  title =        "Low Power Multiple-value Voltage-mode Look-up Table for
                  Quaternary Field Programmable Gate Arrays",
  journal =      JOLPE,
  volume =       7,
  number =       2,
  pages =        " 294--301 ",
  month =        APR,
  year =         2011,
  abstract =     "FPGA structures are widely used as they enable early
                  time-to-market and reduced non-recurring engineering costs
                  in comparison to ASIC designs. Interconnections play a
                  crucial role in modern FPGAs, because they dominate delay,
                  power and area. Multiple-valued logic allows the reduction
                  of the number of interconnections in the circuit, hence can
                  serve as a mean to effectively curtail the impact of
                  interconnections. In this work we propose a new look-up
                  table structure based on a low-power high-speed quaternary
                  voltage-mode device. The most important characteristics of
                  the proposed architecture are that it is a voltage-mode
                  structure, which allows reduced power consumption, and it
                  is implemented using a standard CMOS technology. Our
                  quaternary implementation overcomes previous proposed
                  techniques with simple and efficient CMOS structures.
                  Moreover, results show significant reductions on power
                  consumption and timing in comparison to binary
                  implementations with similar functionality.",
  keywords =     "Multiple-value Logic; Quaternary Logic; Look-up Tables;
                  FPGAs; Standard CMOS Technology.",
  issn =         "1546-1998",
  doi =          "10.1166/jolpe.2011.1138"
}

@Article{Aksoy-MICPRO10-RefE,
  author =       "Levent Aksoy and Ece Olcay Gunes and Paulo Flores",
  title =        "Search algorithms for the multiple constant multiplications
                  problem: Exact and approximate",
  journal =      MICPRO,
  volume =       34,
  number =       5,
  pages =        " 151--162 ",
  month =        AUG,
  year =         2010,
  abstract =     "This article addresses the multiplication of one data
                  sample with multiple constants using addition/subtraction
                  and shift operations, i.e., the multiple constant
                  multiplications (MCM) operation. In the last two decades,
                  many efficient algorithms have been proposed to implement
                  the MCM operation using the fewest number of addition and
                  subtraction operations. However, due to the NP-hardness of
                  the problem, almost all the existing algorithms have been
                  heuristics. The main contribution of this article is the
                  proposal of an exact depth-first search algorithm that,
                  using lower and upper bound values of the search space for
                  the MCM problem instance, finds the minimum solution
                  consuming less computational resources than the previously
                  proposed exact breadth-first search algorithm. We start by
                  describing the exact breadth-first search algorithm that
                  can be applied on real mid-size instances. We also present
                  our recently proposed approximate algorithm that finds
                  solutions close to the minimum and is able to compute
                  better bounds for the MCM problem. The experimental results
                  clearly indicate that the exact depth-first search
                  algorithm can be efficiently applied to large size hard
                  instances that the exact breadth-first search algorithm
                  cannot handle and the heuristics can only find suboptimal
                  solutions.",
  keywords =     "Multiple constant multiplications problem; Depth-first
                  search; Breadth-first search; Graph-based algorithms;
                  Finite Impulse Response filters.",
  issn =         "0141-9331",
  doi =          "10.1016/j.micpro.2009.10.001",
  note =         "Special issue on selected papers from NORCHIP 2008."
}

@Article{Morgado-TODAES09,
  author =       "P. Marques Morgado and Paulo F. Flores and L. Miguel
                  Silveira",
  title =        "Generating Realistic Stimuli for Accurate Power Grid
                  Analysis",
  journal =      TODAES,
  volume =       14,
  number =       3,
  pages =        " 40:1--40:26 ",
  month =        MAY,
  year =         2009,
  abstract =     "Power Analysis tools are an integral component of any
                  current power sign-off methodology. The performance of a
                  design's power grid affects the timing and functionality of
                  a circuit directly impacting the overall performance.
                  Ensuring power grid robustness implies taking into account,
                  among others, static and dynamic effects of voltage drop,
                  ground bounce and electromigration. This type of
                  verification is usually done by simulation, targeting a
                  worst-case scenario where devices, switching almost
                  simultaneously, could impose stern current demands on the
                  power grid. While determination of the exact worst-case
                  switching conditions from the grid perspective is usually
                  not practical, the choice of simulation stimuli has a
                  critical effect on the results of the analysis. Targetting
                  safe but unrealistic settings could lead to pessimistic
                  results and costly over-designs in terms of die area. In
                  this paper we describe a software tool that generates a
                  reasonable, realistic, set of stimuli for simulation. The
                  approach proposed accounts for timing and spatial
                  restrictions that arise from the circuit's netlist and
                  placement and generates an approximation to the worst-case
                  condition. The resulting stimuli indicates that only a
                  fraction of the gates change in any given timing window,
                  leading to a more robust verification methodology,
                  specially in the dynamic case. Generating such stimuli is
                  akin to performing a standard static timing analysis, so
                  the tool fits well within conventional design frameworks.
                  Furthermore, the tool can be used for hot-spot detection in
                  early design stages.",
  keywords =     "Power grid; Verification; Simulation; Voltage drop; Ground
                  bounce; Stimuli Generationy.",
  issn =         "1084-4309",
  doi =          "10.1145/1529255.1529262",
  note =         "Special issue on Demonstrable Sfotware Systems and Hardware
                  Platforms II"
}

@Article{Aksoy-TCADICS08,
  author =       "Levent Aksoy and Eduardo da Costa and Paulo Flores and
                  Jos{\'e} Monteiro",
  title =        "Exact and Approximate Algorithms for the Optimization of
                  Area and Delay in Multiple Constant Multiplications",
  journal =      TCADICS,
  volume =       27,
  number =       6,
  pages =        " 1013--1026 ",
  month =        JUN,
  year =         2008,
  abstract =     "The main contribution of this paper is an exact common
                  subexpression elimination algorithm for the optimum sharing
                  of partial terms in multiple constant multiplications
                  (MCMs). We model this problem as a Boolean network that
                  covers all possible partial terms that may be used to
                  generate the set of coefficients in the MCM instance. We
                  cast this problem into a 0{\'o}1 integer linear programming
                  (ILP) problem by requiring that the single output of this
                  network is asserted while minimizing the number of gates
                  representing operations in the MCM implementation that
                  evaluate to one. A satisfiability (SAT)-based 0{\'o}1 ILP
                  solver is used to obtain the exact solution. We argue that
                  for many real problems, the size of the problem is within
                  the capabilities of current SAT solvers. Because
                  performance is often a primary design parameter, we
                  describe how this algorithm can be modified to target the
                  minimum area solution under a user-specified delay
                  constraint. Additionally, we propose an approximate
                  algorithm based on the exact approach with extremely
                  competitive results. We have applied these algorithms on
                  the design of digital filters and present a comprehensive
                  set of results that evaluate ours and existing
                  approximation schemes against exact solutions under
                  different number representations and using different SAT
                  solvers.",
  keywords =     "Canonical signed digit (CSD); common subexpression
                  elimination (CSE); delay constraints; minimal signed digit
                  (MSD); multiple constant multiplications (MCMs); pseudo-
                  Boolean optimization (PBO).",
  issn =         "0278-0070",
  doi =          "10.1109/TCAD.2008.923242"
}

@Article{Gil-JSAT08,
  author =       "Lu{\'\i}s Gil and Paulo Flores and Lu{\'\i}s Miguel
                  Silveira",
  title =        "{PMSat}: a parallel version of {MiniSAT}",
  journal =      JSAT,
  volume =       6,
  pages =        " 71--98 ",
  year =         2008,
  abstract =     "Parallel computing has become an affordable reality forcing
                  a shift in the programming paradigm from sequential to
                  concurrent applications, specially those who demand much
                  computational power or with large search spaces like
                  SAT-solvers. In this context we present the research,
                  planning and implementation of PMSat: a parallel version of
                  MiniSAT with MPI (Message Passing Interface) technology, to
                  be executed in clusters or grids of computers. The main
                  features of the program are described: search modes,
                  assumptions pruning and share of learnt clauses. An
                  analysis of its performance and load balance is also
                  presented.",
  keywords =     "Parallel Computing; SAT-solver; Satisfiability; Message
                  Passing Interface.",
  issn =         "1574-0617",
  annote =       "Submitted, week-reject 2008-05-17, resubmited 2008-08-27",
  history =      "Submitted, week-reject 2008-05-17, resubmited 2008-08-27"
}

@Article{Flores-TODAES01,
  author =       "Paulo F. Flores and Hor{\'a}cio C. Neto and Jo{\~a}o P.
                  Marques-Silva",
  title =        "An exact solution to the minimum size test pattern problem",
  journal =      TODAES,
  publisher =    "ACM Press",
  address =      "New York, NY, USA",
  volume =       6,
  number =       4,
  pages =        " 629--644 ",
  month =        OCT,
  year =         2001,
  abstract =     "This article addresses the problem of test pattern
                  generation for single stuck-at faults in combinational
                  circuits, under the additional constraint that the number
                  of specified primary input assignments is minimized. This
                  problem has different applications in testing, including
                  the identification of ``don't care'' conditions to be used
                  in the synthesis of Built-In Self-Test (BIST) logic. The
                  proposed solution is based on an integer linear programming
                  (ILP) formulation which builds on an existing Propositional
                  Satisfiability (SAT) model for test pattern generation. The
                  resulting ILP formulation is linear on the size of the
                  original SAT model for test generation, which is linear on
                  the size of the circuit. Nevertheless, the resulting ILP
                  instances represent complex optimization problems, that
                  require dedicated ILP algorithms. Preliminary results on
                  benchmark circuits validate the practical applicability of
                  the test pattern minimization model and associated ILP
                  algorithm.",
  keywords =     "Automatic Test Pattern Generation (ATPG); Built-In
                  Self-Test (BIST); Integer Linear Programming (ILP);
                  propositional satisfiability (SAT); verification and test.",
  issn =         "1084-4309",
  doi =          "http://doi.acm.org/10.1145/502175.502186"
}

@Article{Sousa-Ingenium92,
  author =       "Ana Sousa and Jos{\'e} Abreu and Paulo Flores",
  title =        "{VHDL} novo suporte às metodologias de {CAD}",
  journal =      "Ingenium. Revista da Ordem dos Engenheiros)",
  pages =        " 49--55 ",
  month =        APR,
  year =         1992,
  keywords =     " "
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@String{PFF-AUTHOR = "Paulo Flores" }

@InProceedings{Aksoy-ISCAS15,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "Approximation of Multiple Constant Multiplications Using
                  Minimum Look-Up Tables on {FPGA}",
  booktitle =    ISCAS,
  pages =        " ??--?? ",
  location =     "Lisbon, Portugal",
  month =        MAY # " 24--27, ",
  year =         2015,
  abstract =     "In many digital signal processing (DSP) systems,
                  computations can be carried out within a tolerable error
                  range rather than finding the exact output, enabling
                  significant reductions in area, delay, or power dissipation
                  of the design. This paper addresses the problem of
                  approximating the multiple constant multiplications (MCM)
                  operation which occurs frequently in DSP applications. We
                  consider the realization of constant multiplications using
                  look-up tables (LUTs) on field programmable gate arrays
                  (FPGA) and introduce an exact algorithm, called THETIS,
                  that can find a minimum number of distinct LUTs required to
                  realize the partial products of constant multiplications,
                  satisfying an error constraint. Experimental results show
                  that THETIS can achieve significant reductions in number of
                  LUTs on MCM instances and its solutions lead to less
                  complex filter designs on FPGA than those realized using
                  original filter coefficients. ",
  keywords =     "Aproximating computing, multiple constant multiplications
                  (MCM), FPGA",
  note =         ACCEPTED*,
  annote =       "Poster"
}

@InProceedings{Aksoy-ICCD14,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "Efficient Design of {FIR} Filters Using Hybrid Multiple
                  Constant Multiplications on {FPGA}",
  booktitle =    ICCD,
  publisher =    "IEEE",
  pages =        " 42--47 ",
  location =     "Seoul, Korea",
  month =        OCT # " 19--22, ",
  year =         2014,
  abstract =     "The multiple constant multiplication (MCM) block, that
                  realizes the multiplication of constants by a variable, is
                  a ubiquitous operation in digital signal processing (DSP)
                  systems. It can be implemented using generic multipliers or
                  shifts and adders/subtractors. This paper addresses the
                  problem of finding the minimum number of adders/subtractors
                  to realize the MCM block while a number of multipliers are
                  available to realize some constant multiplications. Such a
                  situation appears in the design of DSP systems on field
                  programmable gate arrays (FPGAs) which also include generic
                  multipliers. We present a 0-1 integer linear programming
                  (ILP) formulation of this problem, yielding an exact common
                  subexpression elimination (CSE) method. Due to the
                  NP-completeness of this problem, we also introduce an
                  approximate graph-based (GB) algorithm. Experimental
                  results show that the proposed methods can find better
                  solutions than a state-of-art algorithm and the use of
                  different numbers of multipliers in the MCM block leads to
                  filter designs with different number of slices, delay, and
                  power dissipation which enable a designer to choose the one
                  that fits best in an application.",
  keywords =     "Multiple Constant Multiplication (MCM); FPGAs; Exact Common
                  Subexpression elimination (CSE) method; Approximate
                  Graph-Based (GB) algorithm; Integer Linear Programming
                  (ILP).",
  doi =          "10.1109/ICCD.2014.6974660"
}

@InProceedings{Sebastiao-HPCS14,
  author =       "Nuno Sebastião and Paulo Flores and Nuno Roma",
  title =        "Optimized {ASIP} Architecture for Compressed {BWT}-Indexed
                  Search in Bioinformatics Applications",
  booktitle =    HPCS,
  publisher =    "IEEE",
  pages =        " 527--534 ",
  location =     "Bologna, Italy",
  month =        JUL # " 21--25, ",
  year =         2014,
  abstract =     "Compressed indexes are adopted by a vast set of
                  bioinformatics applications that deal with extremely large
                  datasets, mainly due to the inherently high memory
                  requirements of uncompressed alternatives. However, the
                  additional computational overhead that is imposed by the
                  usage of such indexes makes them harder to implement in
                  embedded computational platforms, such as biochips, with
                  strict processing and power restrictions. Furthermore,
                  compressed indexes are often characterized by a significant
                  usage of bit-level operations, some of which are not
                  commonly available on General Purpose Processors (GPPs). To
                  circumvent this limitation, an Application-Specific
                  Instruction-set Processor (ASIP) architecture is proposed
                  to accelerate the processing of biological sequences (e.g.,
                  alignment, mapping, etc.) using compressed full-text
                  indexes based on the Burrows-Wheeler Transform (BWT). The
                  proposed processor was built over a RISC micro-architecture
                  and extends the Xilinx MicroBlaze ISA with additional
                  bit-level operations, especially tailored for compressed
                  indexes. When used to perform search operations over the
                  considered compressed index, the proposed architecture
                  provides a reduction of the number of required instructions
                  by about one half. Furthermore, when prototyped on a Xilinx
                  Virtex-7 FPGA, the ASIP proved to offer an overall speedup
                  between 3.1x and 4.5x for the execution of a single
                  threaded operation. To ensure a further processing
                  scalability, the proposed ASIP was designed in order to be
                  easily used as the basic processing unit of multi-core
                  systems, especially tuned for the parallel processing of
                  massive datasets of biological reads.",
  keywords =     "Bioinformatics applications; Embedded computational
                  platforms; Processing of biological sequences: alignment,
                  mapping; Application-Specific Instruction-set Processor
                  (ASIP); Compressed full-text indexes; Burrows-Wheeler
                  Transform (BWT); MicroBlaze ISA extension.",
  isbn =         "978-1-4799-5312-7",
  doi =          "10.1109/HPCSim.2014.6903731"
}

@InProceedings{Aksoy-ISCAS14,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "{ECHO}: A Novel Method for the Multiplierless Design of
                  Constant Array Vector Multiplication",
  booktitle =    ISCAS,
  publisher =    "IEEE",
  pages =        " 1456-1459 ",
  location =     "Melbourne, Australia",
  month =        JUN # " 1--5, ",
  year =         2014,
  abstract =     "The constant array vector multiplication (CAVM) operation
                  realizes the multiplication of a constant array by a vector
                  of variables and occurs in the design of direct form finite
                  impulse response (FIR) and infinite impulse response (IIR)
                  filters. In this paper, for the first time, we directly
                  target the optimization of the multiplierless design of a
                  CAVM operation and introduce a novel algorithm, called
                  ECHO, that can find the fewest number of adders and
                  subtractors required for its implementation. We also
                  describe some hardware optimization techniques that can
                  reduce the gate-level area and delay of the CAVM design. It
                  is shown that the solutions of ECHO include significantly
                  less number of operations and yield less area in FIR filter
                  designs than those of previously proposed state-of-art
                  algorithms.",
  keywords =     "FIR, IIR filters; Constant Array Vector Multiplication
                  (CAVM); Multiplierless design; Cirucuit/hardware
                  optimization techniques; DSP; Gate-level implementation.",
  isbn =         "978-1-4799-3431-7",
  doi =          "10.1109/ISCAS.2014.6865420",
  annote =       "Poster"
}

@InProceedings{Aksoy-IWLS14,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "On the Multiplierless Design of Correctly Rounded Multiple
                  Constant Divisions",
  booktitle =    IWLS,
  pages =        " ??--?? ",
  location =     "San Francisco, USA",
  month =        MAY # " 30, ",
  year =         2014,
  abstract =     "The previously proposed algorithms designed for the
                  constant divisions use the multiply-add architecture which
                  is expensive in terms of area and power dissipation in
                  hardware. This paper introduces the problem of finding the
                  minimum number of adders/subtractors which realize the
                  correctly rounded multiple constant divisions (MCD) under
                  the shift-adds architecture. It proposes a depth-first
                  search method which can explore the whole search space,
                  ensuring the minimum solution. It also presents a local
                  search method which can handle the instances that the exact
                  algorithm cannot cope with due to the NP-completeness of
                  the MCD problem. Both algorithms are equipped with
                  techniques that maximize the sharing of common partial
                  products among the constant multiplications. To the best of
                  our knowledge, these are the first algorithms proposed for
                  the multiplierless design of the correctly rounded MCD.
                  Experimental results include the results of algorithms,
                  indicating that the local search method can find solutions
                  close to the minimum using little computational resources
                  and can obtain better solutions than a state-of-art
                  algorithm. They also include the results of MCD designs
                  under different architectures, showing that the MCD designs
                  under the shift-adds architecture occupy less area and
                  consume less power than those under the multiply-add
                  architecture.",
  keywords =     "Constant division; multiply-add architecture; shift-adds
                  architecture; rounded multiple constant divisions (MCD).",
  note =         PRESENTED*
}

@InProceedings{Aksoy-DATE14,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "Optimization of Design Complexity in Time-Multiplexed
                  Constant Multiplications",
  booktitle =    DATE,
  publisher =    "IEEE",
  pages =        " 1--4 ",
  location =     "Dresden, Germany",
  month =        MAR # " 24--28,",
  year =         2014,
  abstract =     "The multiplication of constants by a data input is an
                  essential operation in digital signal processing (DSP)
                  systems. For applications requiring a large number of
                  constant multiplications under stringent hardware
                  constraints, it is generally realized under a folded
                  architecture, where a single constant selected from a set
                  of multiple constants is multiplied by the data input at
                  each time, called time-multiplexed constant multiplication
                  (TMCM). This paper addresses the problem of optimizing the
                  complexity of a TMCM design and introduces an algorithm
                  that finds the least complex TMCM design by sharing the
                  logic operators, i.e., adders, subtractors,
                  adders/subtractors, and multiplexors (MUXes). It includes
                  efficient search methods, yielding better results than
                  existing TMCM algorithms.",
  keywords =     "Time-Multiplexed Constant Multiplication (TMCM); Folded
                  architecture; shift-adds architecture optimization; Digital
                  Signal Processing (DSP).",
  doi =          "10.7873/DATE.2014.313",
  annote =       "Poster"
}

@InProceedings{Aksoy-VLSI-SoC13,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "Towards the Least Complex Time-Multiplexed Constant
                  Multiplication",
  booktitle =    VLSI-SOC,
  pages =        " 328--331 ",
  location =     "Istanbul, Turkey",
  month =        OCT # " 7--9, ",
  year =         2013,
  abstract =     "The multiplication of a variable by a single constant
                  selected from a set of fixed constants at a time, called
                  the time-multiplexed constant multiplication (TMCM), is
                  frequently used in digital signal processing (DSP) systems.
                  Existing algorithms implement the TMCM operation using
                  multiplexers (MUXes), adders/subtractors, and shifts, and
                  reduce its complexity by merging single/multiple constant
                  multiplication graphs and by sharing the basic structures.
                  This paper introduces ARION, that exploits the most common
                  partial terms in the TMCM design on top of the previously
                  proposed DAGfusion algorithm, which merges the single
                  constant multiplication graphs. Experimental results show
                  that ARION obtains significantly better solutions than
                  prominent TMCM methods.",
  doi =          "10.1109/VLSI-SoC.2013.6673302"
}

@InProceedings{Aksoy-ECCTD13,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "Exploration of Tradeoffs in the Design of Integer Cosine
                  Transforms for Image Compression",
  booktitle =    ECCTD,
  pages =        " 1--4 ",
  location =     "Dresden, Germany",
  month =        SEP # " 8--12, ",
  year =         2013,
  abstract =     "In image data compression, integer cosine transforms (ICTs)
                  have been preferred to discrete cosine transforms (DCTs)
                  due to their similar transform efficiency and lower
                  implementation cost. However, there exist many alternative
                  ICTs with different performance measures and implementation
                  costs. In this work, we explore all possible ICTs, compute
                  their performance measures, and find their implementation
                  costs in terms of the number of adders/subtractors, where a
                  state-of-art technique is used to realize ICTs under a
                  shift-adds architecture. We also investigate the tradeoff
                  between performance and implementation cost, present the
                  pareto-optimal points of this tradeoff, and introduce
                  promising ICTs that were not considered before.",
  keywords =     "Integer Cosine Transforms (ICT); Discrete Cosine Transforms
                  (DCT); Shift-Adds Architecture; Multiplier-Less Hardware
                  Logic Gates.",
  doi =          "10.1109/ECCTD.2013.6662223"
}

@InProceedings{Brito-NEWCAS13,
  author =       "Diogo Brito and Jorge Fernandes and Paulo Flores and José
                  Monteiro",
  title =        "Standard {CMOS} Voltage-Mode {QLUT} Using a Clock Boosting
                  Technique",
  booktitle =    NEWCAS,
  pages =        " 1--4 ",
  location =     "Paris, France",
  month =        JUN # " 16--19, ",
  year =         2013,
  abstract =     "Interconnect has become preponderant in many aspects of
                  digital circuit design, namely delay, power and area. This
                  effect is particularly true for FPGAs, where
                  interconnection is often the most limiting factor.
                  Multiple-valued logic allows to reduce interconnections,
                  within logic cells and between them, hence effectively
                  mitigating the impact of interconnections. In this paper we
                  propose a new look-up table structure based on a low-power
                  high-speed quaternary voltage-mode device. Our quaternary
                  implementation overcomes the drawbacks of previously
                  proposed techniques by using a standard CMOS technology and
                  a clock boosting technique to enhance speed without
                  increasing consumption. Moreover, we present an ASIC
                  prototype of a full adder based on the designed look-up
                  table and experimental results are obtained and compared
                  with simulation. The prototype is designed to work at 100
                  MHz and it consumes 128 uW.",
  keywords =     "Quaternary Logic, Look-up Table, FPGA, Clock Boosting.",
  isbn =         "978-1-4799-0618-5",
  doi =          "10.1109/NEWCAS.2013.6573573"
}

@InProceedings{Neves-ASAP13,
  author =       "Nuno Neves and Nuno Sebastião and André Patrício and David
                  Matos and Pedro Tomás and Paulo Flores and Nuno Roma",
  title =        "{BioBlaze}: Multi-Core {SIMD} {ASIP} for {DNA} Sequence
                  Alignment",
  booktitle =    ASAP,
  publisher =    "IEEE",
  pages =        " 241--244 ",
  location =     "Washington D.C., USA",
  month =        JUN # " 5--7, ",
  year =         2013,
  abstract =     "A new Application-Specific Instruction-set Processor (ASIP)
                  architecture for biological sequences alignment is proposed
                  in this manuscript. This architecture achieves high
                  processing throughputs by exploiting both fine and
                  coarse-grained parallelism. The former is achieved by
                  extending the Instruction Set Architecture (ISA) of a
                  synthesizable processor to include multiple specialized
                  SIMD instructions that implement vector-vector and
                  vector-scalar arithmetic, logic, load/store and control
                  operations. Coarse-grained parallelism is achieved by using
                  multiple cores to cooperatively align multiple sequences in
                  a shared memory architecture, comprising proper
                  hardware-specific synchronization mechanisms. To ease the
                  programming, a compilation framework based on an adaptation
                  of the GCC back-end was also implemented. The proposed
                  system was prototyped and evaluated on a Xilinx Virtex-7
                  FPGA, achieving a 200MHz working frequency. A sequential
                  and a state-of-the-art SIMD implementations of the
                  Smith-Waterman algorithm were programmed in both the
                  proposed ASIP and an Intel Core i7 processor. When
                  comparing the achieved speedups, it was observed that the
                  proposed ISA achieves a 40x speedup, which contrasts with
                  the 11x speedup provided by SSE2 in the Intel Core i7
                  processor. The scalability of the multi-core system was
                  also evaluated and proved to scale almost linearly with the
                  number of cores.",
  issn =         "2160-0511",
  isbn =         "978-1-4799-0494-5",
  doi =          "10.1109/ASAP.2013.6567581",
  annote =       "Accepted as a poster."
}

@InProceedings{Marques-FLAIRS13,
  author =       "Ricardo Marques and Luis Guerra e Silva and Paulo Flores
                  and L. Miguel Silveira.",
  title =        "Improving {SAT} Solver Efficiency using a Multi-core
                  Approach",
  booktitle =    FLAIRS,
  pages =        " 94--99 ",
  location =     "Florida, USA",
  month =        MAY # " 22--24, ",
  year =         2013,
  abstract =     "Many practical problems in multiple fields can be converted
                  to a SAT problem, or a sequence of SAT problems, such that
                  their solution immediately implies a solution to the
                  original problem. Despite the enormous progress achieved
                  over the last decade in the development of SAT solvers,
                  there is strong demand for higher algorithm efficiency to
                  solve harder and larger problems. The widespread
                  availability of multi-core, shared memory parallel
                  environments provides an opportunity for such improvements.
                  In this paper we present our results on improving the
                  effectiveness of standard SAT solvers on such
                  architectures, through a portfolio approach. Multiple
                  instances of the same basic solver using different
                  heuristic strategies, for search-space exploration and
                  problem analysis, share information and cooperate towards
                  the solution of a given problem. Results from the
                  application of our methodology to known problems from SAT
                  competitions show relevant improvements over the state of
                  the art and yield the promise of further advances."
}

@InProceedings{Aksoy-GLSVLSI13,
  author =       "Levent Aksoy and Paulo Flores and José Monteiro",
  title =        "{SIREN}: A Depth-First Search Algorithm for the Filter
                  Design Optimization Problem",
  booktitle =    GLSVLSI,
  pages =        " 179--184 ",
  location =     "Paris, France",
  month =        MAY # " 2--3, ",
  year =         2013,
  abstract =     "This paper addresses the filter design optimization (FDO)
                  problem that is to find a set of filter coefficients which
                  yields the least design complexity while meeting the
                  required filter constraints. The design complexity of a
                  filter is defined in terms of the total number of
                  adders/subtracters, assuming that the multiplication of
                  coefficients by the filter input is realized under a
                  shift-adds architecture. Existing algorithms use efficient
                  search methods, but none of them can guarantee the minimum
                  design complexity. Hence, we propose an exact algorithm,
                  called SIREN, that finds an optimum solution of the FDO
                  problem under the minimum quantization value. It is based
                  on a depth-first search method equipped with an exact
                  technique, that finds the minimum number of
                  adders/subtracters in the multiplier block of the filter,
                  and search pruning techniques that enable it to be
                  applicable to practical instances. Experimental results
                  show that SIREN can still find better solutions than
                  efficient FDO algorithms and its solutions lead to filters
                  with significantly less area when compared to a
                  straightforward filter design technique.",
  keywords =     "Filter design optimization; Depth-first search; Linear
                  programming; Multiplierless filter design.",
  isbn =         "978-1-4503-2032-0",
  doi =          "10.1145/2483028.2483087"
}

@InProceedings{Brito-ICECS12,
  author =       "Diogo Brito and Jorge Fernandes and Paulo Flores and José
                  Monteiro",
  title =        "Design and Characterization of a {QLUT} in a Standard
                  {CMOS} Process",
  booktitle =    ICECS,
  pages =        " 288--291 ",
  location =     "Seville, Spain",
  month =        DEC # " 9--12, ",
  year =         2012,
  abstract =     "Interconnect has become preponderant in many aspects of
                  circuit design, namely delay, power and area. This effect
                  is particularly true for FPGAs, where interconnect is often
                  the most limiting factor. Quaternary logic offers a means
                  to reduce interconnect since each circuit wire can, in
                  principle, carry the same information as two binary wires.
                  We have proposed in [1] a design implementing a quaternary
                  low-power highspeed look-up table. The main features of
                  this circuit are being based on a voltage-mode structure
                  and using only standard CMOS technology. In this paper we
                  present the design of a prototype implementation and
                  experimental results. These results are discussed and
                  conclusions are drawn that provide further guidelines for
                  improvement.",
  keywords =     "Multi-value logic; Quaternary circuits; Circuit Layout,
                  Fabrication and Test.",
  isbn =         "978-1-4673-1261-5 (Print) 978-1-4673-1259-2 (Online)",
  doi =          "10.1109/ICECS.2012.6463744"
}

@InProceedings{Aksoy-ICCAD12-RefD,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "Multiple Tunable Constant Multiplications: Algorithms and
                  Applications",
  booktitle =    ICCAD,
  pages =        " 473--479 ",
  location =     "San José, CA, USA",
  month =        NOV # " 5--8, ",
  year =         2012,
  abstract =     "The multiple constant multiplications (MCM) problem, that
                  is defined as finding the minimum number of addition and
                  subtraction operations required for the multiplication of
                  multiple constants by an input variable, has been the
                  subject of great interest since the complexity of many
                  digital signal processing (DSP) systems is dominated by an
                  MCM operation. This paper introduces a variant of the MCM
                  problem, called multiple tunable constant multiplications
                  (MTCM) problem, where each constant is not fixed as in the
                  MCM problem, but can be selected from a set of possible
                  constants. We present an exact algorithm that formalizes
                  the MTCM problem as a 0-1 integer linear programming (ILP)
                  problem when constants are defined under a number
                  representation. We also introduce a local search method for
                  the MTCM problem that includes an efficient MCM algorithm.
                  Furthermore, we show that these techniques can be used to
                  solve various optimization problems in finite impulse
                  response (FIR) filter design and we apply them to one of
                  these problems. Experimental results clearly show the
                  efficiency of the proposed methods when compared to
                  prominent algorithms designed for the MCM problem",
  keywords =     "MTCM - Multiple Tunable Constant Multiplications; 0-1 ILP;
                  Exact algorithm; Local search algorithm.",
  issn =         "1092-3152",
  doi =          "10.1145/2429384.2429482"
}

@InProceedings{Aksoy-DATE12,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "Design of Low-Complexity Digital Finite Impulse Response
                  Filters on {FPGA}s",
  booktitle =    DATE,
  pages =        " 1197--1202 ",
  location =     "Dresden, Germany",
  month =        MAR # " 12--16, ",
  year =         2012,
  abstract =     "The multiple constant multiplications (MCM) operation,
                  which realizes the multiplication of a set of constants by
                  a variable, has a significant impact on the complexity and
                  performance of the digital finite impulse response (FIR)
                  filters. Over the years, many high-level algorithms and
                  design methods have been proposed for the efficient
                  implementation of the MCM operation using only addition,
                  subtraction, and shift operations. The main contribution of
                  this paper is the introduction of a high-level synthesis
                  algorithm that optimizes the area of the MCM operation and,
                  consequently, of the FIR filter design, on field
                  programmable gate arrays (FPGAs) by taking into account the
                  implementation cost of each addition and subtraction
                  operation in terms of the number of fundamental building
                  blocks of FPGAs. It is observed from the experimental
                  results that the solutions of the proposed algorithm yield
                  less complex FIR filters on FPGAs with respect to those
                  whose MCM part is implemented using prominent MCM
                  algorithms and design methods.",
  keywords =     "FIR filter design;FPGA;MCM algorithms;field programmable
                  gate arrays;high-level algorithms;low-complexity digital
                  finite impulse response filters design;multiple constant
                  multiplications operation;operations level synthesis
                  algorithm;shift operations;FIR filters;field programmable
                  gate arrays;high level synthesis.",
  issn =         "1530-1591",
  isbn =         "978-1-4577-2145-8",
  doi =          "10.1109/DATE.2012.6176675"
}

@InProceedings{Aksoy-VLSI-SoC11,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "A Hybrid Algorithm for the Optimization of Area and Delay
                  in Linear {DSP} Transforms",
  booktitle =    VLSI-SOC,
  pages =        " 148--153 ",
  location =     "Hong Kong, China",
  month =        OCT # " 3--5,",
  year =         2011,
  abstract =     "This paper addresses the problem of multiplierless
                  realization of linear transforms using the fewest number of
                  addition and subtraction operations and introduces a hybrid
                  algorithm that incorporates a graph-based technique, called
                  the difference method, and a Common Subexpression
                  Elimination (CSE) algorithm. In the proposed algorithm,
                  while the difference method extracts the most promising
                  realizations of linear transforms in each iteration, the
                  CSE algorithm achieves the most common minimum conflicting
                  subexpressions in each solution of the difference method.
                  This paper also describes how the hybrid algorithm can be
                  modified in order to find a solution with the fewest number
                  of operations under a delay constraint. The experimental
                  results on a comprehensive set of instances show the
                  efficiency of the hybrid algorithms, at both high-level and
                  gate-level, in comparison to previously proposed
                  algorithms.",
  keywords =     "Multiplierless Linear Transforms, Hybrid algorithms,
                  Graph-based techniques, Common Subexpression Elimination
                  (CSE), High-level and gate-level optimization.",
  isbn =         "978-1-4577-0171-9 / 978-1-4577-0169-6",
  doi =          "10.1109/VLSISoC.2011.6081637"
}

@InProceedings{Aksoy-ECCTD11,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "Optimization of Gate-Level Area in High Throughput Multiple
                  Constant Multiplications",
  booktitle =    ECCTD,
  publisher =    "IEEE",
  pages =        " 588--591 ",
  location =     "Linköping, Sweden",
  month =        AUG # " 29--31, ",
  year =         2011,
  abstract =     "This paper addresses the problem of optimizing gate-level
                  area in a pipelined Multiple Constant Multiplications (MCM)
                  operation and introduces a high-level synthesis algorithm,
                  called HCUB-DC+ILP. In the HCUB-DC+ILP algorithm,
                  initially, a solution with the fewest number of operations
                  under a minimum delay constraint is found by the Hcub-DC
                  algorithm. Then, the area around this local minimum point
                  is explored exactly using a 0-1 Integer Linear Programming
                  (ILP) technique that considers the gate-level
                  implementation of the pipelined MCM operation. The
                  experimental results at both high-level and gate-level
                  clearly show the efficiency of HCUB-DC+ILP over previously
                  proposed prominent MCM algorithms.",
  keywords =     "Algorithm Design and Analysis, Optimization, Pipeline
                  Processing, Signal Processing Algorithms, Pipelined
                  Multiple Constant Multiplications (MCM), Integer Linear
                  Programming (ILP), Gate-Level Optimization, HCUB-DC+ILP and
                  Hcub-DC algorithm.",
  isbn =         "978-1-4577-0617-2 / 978-1-4577-0169-6",
  doi =          "10.1109/ECCTD.2011.6043602"
}

@InProceedings{Aksoy-ISCAS11,
  author =       "Levent Aksoy and Cristiano Lazzari and Eduardo Costa and
                  Paulo Flores and José Monteiro",
  title =        "Optimization of Area in Digit-Serial Multiple Constant
                  Multiplications at Gate-Level",
  booktitle =    ISCAS,
  publisher =    "IEEE",
  pages =        " 2737-- 2740 ",
  location =     "Rio de Janeiro, Brazil",
  month =        MAY # " 15--18, ",
  year =         2011,
  abstract =     "The last two decades have seen many efficient algorithms
                  and architectures for the design of low-complexity
                  bit-parallel Multiple Constant Multiplications (MCM)
                  operation, that dominates the complexity of Digital Signal
                  Processing (DSP) systems. On the other hand, digit-serial
                  architectures offer alternative low-complexity designs,
                  since digit-serial operators occupy less area and are
                  independent of the data wordlength. This paper introduces
                  the problem of designing a digit-serial MCM operation with
                  minimal area at gate-level and presents the exact
                  formalization of the area optimization problem as a 0-1
                  Integer Linear Programming (ILP) problem. Experimental
                  results show the efficiency of the proposed algorithm and
                  digit-serial MCM designs in terms of area at gate-level.",
  keywords =     "Multiple Constant Multiplications (MCM), Digit-Serial
                  Multiple Constant Multiplication operation, Low-complexity
                  design, Area optimization, 0-1 Integer Linear Programming
                  problem, Area optimization problem, Gate-level
                  optimization.",
  issn =         "0271-4302",
  isbn =         "978-1-4244-9473-6 / 978-1-4244-9472-9",
  doi =          "10.1109/ISCAS.2011.5938171",
  annote =       "Accepted as poster"
}

@InProceedings{Aksoy-GLSVLSI11-mcmSerial,
  author =       "Levent Aksoy and Cristiano Lazzari and Eduardo Costa and
                  Paulo Flores and José Monteiro",
  title =        "Efficient Shift-Adds Design of Digit-Serial Multiple
                  Constant Multiplications",
  booktitle =    GLSVLSI,
  publisher =    "ACM",
  pages =        " 61--66 ",
  location =     "Lausanne, Switzerland",
  month =        MAY # " 2--4, ",
  year =         2011,
  abstract =     "The bit-parallel realization of the multiplication of a
                  variable by a set of constants using only
                  addition/subtraction and shift operations has been explored
                  extensively over the years, as large number of constant
                  multiplications dominate the complexity of many Digital
                  Signal Processing (DSP) systems. On the other hand,
                  digit-serial architectures offer alternative low-complexity
                  designs, since digit-serial operators occupy less area and
                  are independent of the data wordlength. This paper
                  introduces an approximate algorithm that targets the
                  optimization of the gate-level area of digit-serial
                  constant multiplications under a shift-adds architecture.
                  Experimental results on a comprehensive set of instances
                  indicate that the approximate algorithm presented in this
                  paper gives better solutions than the previously proposed
                  algorithms in terms of area at gate-level and yields
                  alternative low-complexity designs relatively to the
                  bit-parallel design. Furthermore, it is observed on the
                  digit-serial filter designs that the use of shift-adds
                  architecture yields area reduction up to 43.6\% with
                  respect to the filter designs that use generic digit-serial
                  constant multipliers.",
  keywords =     "Multiple constant multiplications (MCM), digit-serial
                  arithmetic, gate-level area optimization, finite impulse
                  response (FIR) filters.",
  isbn =         "978-1-4503-0667-6",
  doi =          "10.1145/1973009.1973023",
  annote =       "Tipo A no IST"
}

@InProceedings{Aksoy-GLSVLSI11-mcmLowpower,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "Design of Low-Power Multiple Constant Multiplications Using
                  Low-Complexity Minimum Depth Operations",
  booktitle =    GLSVLSI,
  publisher =    "ACM",
  pages =        " 79--84 ",
  location =     "Lausanne, Switzerland",
  month =        MAY # " 2--4, ",
  year =         2011,
  abstract =     "Existing optimization algorithms for the multiplierless
                  realization of multiple constant multiplications (MCM)
                  typically target the minimization of the number of addition
                  and subtraction operations. Since power dissipation is
                  directly related to the amount of hardware, some power
                  reduction is indirectly achieved by these algorithms.
                  However, in many cases, glitching plays an equally
                  important role in defining the power consumption. This is
                  specially true for arithmetic circuits, and in particular
                  to MCM due to high logic depth and large number of
                  re-convergent paths. This paper introduces exact algorithms
                  that search the optimal area of an MCM design at gate-level
                  where each constant multiplication is implemented in its
                  minimum depth. Experimental results show that the proposed
                  algorithms lead to MCM designs consuming significantly less
                  power with respect to those obtained by the MCM
                  algorithms.",
  keywords =     "Multiple constant multiplications (MCM), 0-1 integer linear
                  programming, high-level synthesis, low-power MCM design.",
  isbn =         "978-1-4503-0667-6",
  doi =          "10.1145/1973009.1973026",
  annote =       "Tipo A no IST"
}

@InProceedings{Jaccottet-VLSI-SoC10,
  author =       "Diego Jaccottet and Eduardo Costa and Levent Aksoy and
                  Paulo Flores and José Monteiro",
  title =        "Design of Low-Complexity and High-Speed Digital Finite
                  Impulse Response Filters",
  booktitle =    VLSI-SOC,
  pages =        " 292--297 ",
  location =     "Madrid, Spain",
  month =        SEP # " 27--29,",
  year =         2010,
  abstract =     "In this paper, we introduce a design methodology to
                  implement low-complexity and high-speed digital Finite
                  Impulse Response (FIR) filters. Since FIR filters suffer
                  from a large number of constant multiplications, in the
                  proposed method the constant multiplications are replaced
                  by addition/subtraction and shift operations. Also, based
                  on the design objective, i.e., low-complexity or
                  high-speed, the addition/subtraction operations are
                  implemented using Ripple Carry Adder (RCA) or Carry-Save
                  Adder (CSA) architectures respectively. Furthermore,
                  high-level algorithms designed for the optimization of the
                  number of RCA and CSA blocks are used to reduce the
                  complexity of the FIR filter. Thus, a Computer-Aided Design
                  (CAD) tool that synthesizes low-complexity and high-speed
                  FIR filters in a shift-adds architecture is developed. It
                  is observed from the experimental results on FIR filter
                  instances that the developed CAD tool can find better FIR
                  filter designs in terms of area and delay than those
                  obtained using efficient general multipliers.",
  keywords =     "Multiple constant multiplications, addition and subtraction
                  architectures, low complexity FIR filter, ripple carry
                  adder, carry saver adder.",
  isbn =         "978-1-4244-6469-2",
  doi =          "10.1109/VLSISOC.2010.5642676"
}

@InProceedings{Aksoy-DSD10,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and José
                  Monteiro",
  title =        "Optimization of Area and Delay at Gate-Level in Multiple
                  Constant Multiplications",
  booktitle =    DSD,
  pages =        " 3--10 ",
  location =     "Lille, France",
  month =        SEP # " 1--3,",
  year =         2010,
  abstract =     "Although many efficient high-level algorithms have been
                  proposed for the realization of Multiple Constant
                  Multiplications (MCM) using the fewest number of addition
                  and subtraction operations, they do not consider the
                  low-level implementation issues that directly affect the
                  area, delay, and power dissipation of the MCM design. In
                  this paper, we initially present area efficient addition
                  and subtraction architectures used in the design of the MCM
                  operation. Then, we propose an algorithm that searches an
                  MCM design with the smallest area taking into account the
                  cost of each operation at gate-level. To address the area
                  and delay tradeoff in MCM design, the proposed algorithm is
                  improved to find the smallest area solution under a delay
                  constraint. The experimental results show that the proposed
                  algorithms yield low-complexity and high-speed MCM designs
                  with respect to those obtained by the prominent algorithms
                  designed for the optimization of the number of operations
                  and the optimization of area at gate-level.",
  keywords =     "Multiple constant multiplications, addition and subtraction
                  architectures, delay aware area optimization, gate-level
                  area optimization, graph-based algorithms.",
  isbn =         "78-1-4244-7839-2",
  doi =          "10.1109/DSD.2010.32"
}

@InProceedings{Sebastiao-HPCS10,
  author =       "Nuno Sebastião and Tiago Dias and Nuno Roma and Paulo
                  Flores",
  title =        "Integrated Accelerator Architecture for {DNA} Sequences
                  Alignment with Enhanced Traceback Phase",
  booktitle =    HPCS,
  pages =        " 16--23",
  location =     "Caen, France",
  month =        JUN # " 28--" # JUL # " 2, ",
  year =         2010,
  abstract =     "Dynamic programming algorithms are widely used to find the
                  optimal sequence alignment between any two DNA sequences.
                  This paper presents an innovative technique to
                  significantly reduce the computation time and memory space
                  requirements of the traceback phase of the Smith-Waterman
                  algorithm, together with a flexible and scalable hardware
                  architecture to accelerate the overall procedure. The
                  results obtained from an implementation using a Virtex-4
                  FPGA showed that the proposed technique is feasible and is
                  able to provide a significant speedup. For the considered
                  test sequences, a speedup of about 6000 was obtained.",
  keywords =     "DNA, Hardware Accelerator, Local Sequence Alignment,
                  Traceback.",
  isbn =         "978-1-4244-6827-0",
  doi =          "10.1109/HPCS.2010.5547154"
}

@InProceedings{Lazzari-ISCAS10,
  author =       "Cristiano Lazzari and Paulo Flores and José Monteiro and
                  Luigi Carro",
  title =        "Voltage-mode Quaternary {FPGA}s: An Evaluation of
                  Interconnections",
  booktitle =    ISCAS,
  pages =        " 869--972 ",
  location =     "Paris, France",
  month =        MAY # " 30--" # JUN # " 2, ",
  year =         2010,
  abstract =     "This work presents a study about FPGA interconnections and
                  evaluates their effects on voltage-mode binary and
                  quaternary FPGA structures. FPGAs are widely used due to
                  the fast time-to-market and reduced non-recurring
                  engineering costs in comparison to ASIC designs.
                  Interconnections play a crucial role in modern FPGAs,
                  because they dominate delay, power and area. The use of
                  multiple-valued logic allows the reduction of the number of
                  signals in the circuit, hence providing a mean to
                  effectively curtail the impact of interconnections. The
                  most important characteristic of the results are the
                  reduced fanout, fewer number of wires and the smaller wire
                  length presented by the quaternary devices. We use a set of
                  arithmetic circuits to compare binary and quaternary
                  implementations. This work presents the first step on
                  developing quaternary circuits by mapping any binary random
                  logic onto quaternary devices.",
  keywords =     "FPGAs, Mutiple-valued Logic, Low-power Quaternary
                  Voltage-mode Device, Interconnectios, Arithmetic Circuits.",
  isbn =         "978-1-4244-5308-5",
  doi =          "10.1109/ISCAS.2010.5537423"
}

@InProceedings{Lazzari-DATE10,
  author =       "Cristiano Lazzari and Paulo Flores and José Monteiro and
                  Luigi Carro",
  title =        "A New Quaternary {FPGA} Based on a Voltage-mode
                  Multi-valued Circuit",
  booktitle =    DATE,
  pages =        " 1797--1802 ",
  location =     "Dresden, Germany",
  month =        MAR # " 8--12, ",
  year =         2010,
  abstract =     "FPGA structures are widely used due to early time-to-market
                  and reduced non-recurring engineering costs in comparison
                  to ASIC designs. Interconnections play a crucial role in
                  modern FPGAs, because they dominate delay, power and area.
                  Multiple-valued logic allows the reduction of the number of
                  signals in the circuit, hence can serve as a mean to
                  effectively curtail the impact of interconnections. In this
                  work we propose a new FPGA structure based on a low-power
                  quaternary voltage-mode device. The most important
                  characteristics of the proposed architecture are the
                  reduced fanout, low number of wires and switches, and the
                  small wire length. We use a set of FIR filters as a
                  demonstrator of the benefits of the quaternary
                  representation in FPGAs. Results show a significant
                  reduction on power consumption.",
  keywords =     "FPGAs, Mutiple-valued Logic, Low-power Quaternary
                  Voltage-mode Device, FIR filters.",
  issn =         "1530-1591",
  isbn =         "978-1-4244-7054-9",
  doi =          "10.1109/DATE.2010.5457105"
}

@InProceedings{Lazzari-SCS09,
  author =       "Cristiano Lazzari and Paulo Flores and José Carlos
                  Monteiro",
  title =        "Power and Delay Comparison of Binary and Quaternary
                  Arithmetic Circuits",
  booktitle =    SCS,
  pages =        " 1--6",
  location =     "Jerba, Tunisia",
  month =        NOV # " 6--8, ",
  year =         2009,
  abstract =     "Interconnections play a crucial role in deep sub-micron
                  designs because they dominate the delay, power and area.
                  This is especially critical for modern million-gates FPGAs,
                  where as much as 90\% of chip area is devoted to
                  interconnections. Multiple-valued logic allows for the
                  reduction of the required number of signals in the circuit,
                  hence can serve as a means to effectively curtail the
                  impact of interconnections. We present in this paper a
                  comparison of binary and quaternary implementations of
                  arithmetic modules based on lookup table structures using a
                  voltage-mode circuits. Our assessment demonstrates that
                  significant power reduction is possible through the use of
                  quaternary structures, with very low delay penalties.",
  keywords =     "Quaternary Logic, Arithmetic Circuits, FPGA Synthesis,
                  Delay and Power Consumption.",
  isbn =         "978-1-4244-4397-0",
  doi =          "10.1109/ICSCS.2009.5412586"
}

@InProceedings{Aksoy-ICC09,
  author =       "Levent Aksoy and Ece Olcay Gunes and Paulo Flores",
  title =        "Optimization of Area under a Delay Constraint in Multiple
                  Constant Multiplications",
  booktitle =    ICC,
  publisher =    "World Scientific and Engineering Academy and Society
                  (WSEAS)",
  pages =        " 81--86",
  location =     "Rodos, Greece",
  month =        JUL # " 22--24, ",
  year =         2009,
  abstract =     "The Multiple Constant Multiplications (MCM), i.e., the
                  multiplication of a variable by a set of constants, has
                  been a central operation and performance bottleneck in many
                  digital signal processing applications such as, image and
                  video processing, digital television, and wireless
                  communications. Since the design of multiplications is
                  expensive in terms of area, delay, and power consumption in
                  hardware, the area-delay optimization of the MCM operation
                  has often been accomplished by using the shift-adds
                  architecture. However, most of the previously proposed
                  algorithms have focused on the optimization of area
                  ignoring the crucial tradeoff between area and delay of the
                  computation. In this paper, we introduce an approximate
                  algorithm that can find near optimal area solutions under
                  the user specified delay constraint. It is shown by the
                  experimental results that the proposed algorithm finds
                  better area-delay solutions than the previously proposed
                  efficient algorithms.",
  keywords =     "Multiple constant multiplications, area and delay
                  optimization, high-level synthesis, digital FIR filters.",
  issn =         "790-5117",
  isbn =         "978-960-474-096-3"
}

@InProceedings{Aksoy-NORCHIP08,
  author =       "Levent Aksoy and Ece Olcay Gunes and Paulo Flores",
  title =        "An Exact Breadth-First Search Algorithm for the Multiple
                  Constant Multiplications Problem",
  booktitle =    NORCHIP,
  pages =        " 41--46 ",
  location =     "Tallinn, Estonia",
  month =        NOV # " 16--17, ",
  year =         2008,
  abstract =     "This paper addresses the multiplication of one data sample
                  with multiple constants using addition/subtraction and
                  shift operations, i.e., the multiple constant
                  multiplications (MCM) problem. The MCM problem finds itself
                  and its variants in many applications, such as digital
                  finite impulse response (FIR) filters, linear signal
                  transforms, and computer arithmetic. Although many
                  efficient algorithms have been proposed to implement the
                  MCM using the fewest number of operations, they have been
                  heuristics, i.e., they cannot guarantee the minimum
                  solution. In this work, we propose an exact algorithm based
                  on the exhaustive search that finds the minimum number of
                  operations solution of an MCM problem. We have tested the
                  exact algorithm on a set of instances including FIR filter
                  and randomly generated instances, and compared with the
                  previously proposed efficient heuristics. It is observed
                  from the experimental results that, even though the exact
                  algorithm can be applied only on less complex instances, it
                  finds better solutions than the prominent heuristics when
                  applied on real size instances",
  keywords =     "Multiple constant multiplications problem, exact algorithm,
                  breadth-first exhaustive search.",
  isbn =         "978-1-4244-2492-4",
  doi =          "10.1109/NORCHP.2008.4738280",
  annote =       "Submitted ICCAD 2008-04-14, Rejected 2008-06-27"
}

@InProceedings{Daitx-SCS08,
  author =       "Fábio Daitx and Vagner Rosa and Eduardo Costa and Paulo
                  Flores and Sérgio Bampi",
  title =        "{VHDL} Generation of Optimized {FIR} Filters",
  booktitle =    SCS,
  pages =        " 1--5",
  location =     "Hammamet, Tunisia",
  month =        NOV # " 7--9, , ",
  year =         2008,
  abstract =     "This work proposes an VHDL generation software for
                  optimized FIR filters. In this paper a near optimum
                  algorithm for constant coefficient FIR filters was used.
                  This algorithm uses general coefficient representation for
                  the optimal sharing of partial products in Multiple
                  Constants Multiplications (MCM). The developed tool was
                  compared to Matlab FDA toolbox. Synthesis results show that
                  our tool is able to produce significantly better hardware
                  than FDA toolbox, doubling the speed and reducing the
                  silicon area by 75\%. The software produces a generic VHDL
                  output, synthesizable to ASIC or FPGA.",
  keywords =     "FIR filters; MCM optimization; FPGA implementation.",
  isbn =         "978-1-4244-2627-0",
  doi =          "10.1109/ICSCS.2008.4746944",
  url =          "http://www.emc-lab.net/Conferences/SCS2008/index.php",
  annote =       "Rejected-ISCAS08+VLSISoC07, Submitted-CSC08 on 2008-08-10"
}

@InProceedings{Morgado-PATMOS08,
  author =       "P. Marques Morgado and Paulo F. Flores and Jos{\'e} C.
                  Monteiro and L. Miguel Silveira",
  title =        "Generating Worst-Case Stimuli for Accurate Power Grid
                  Analysis",
  booktitle =    PATMOS,
  editor =       "Lars Svebsson and José Monteiro",
  publisher =    SPRINGER,
  pages =        " 247--257 ",
  location =     "Lisbon, Portugal",
  month =        SEP # " 12--14, ",
  year =         2008,
  abstract =     "Power distribution systems provide the voltages and
                  currents that devices in a circuit need to operate properly
                  and silicon success requires its careful design and
                  verification. However, problems like voltage drop, ground
                  bounce and electromigration which may cause chip failures,
                  are worsening, as more devices, operating at higher
                  frequencies, are placed closer together. Verification of
                  this type of systems is usually done by simulation, a
                  costly endeavor given the size of current grids, making the
                  determination of the worst-case input setting a crucial
                  task. Current methodologies are based on supposedly safe
                  settings targeting either unrealistic simultaneous
                  switching on all signals or heuristic accounts of the joint
                  switching probability of nearby signals. In this paper we
                  propose a methodology for computation of the worst-case
                  stimuli for power grid analysis. This is accomplished by
                  determining the input vector that maximizes the number of
                  gates in close proximity to each other that can switch in a
                  given time window. The addition of these temporal and
                  spatial restrictions makes the solution of the underlying
                  optimization problem feasible. Comparisons with existing
                  alternatives show that only a fraction of the gates change
                  in any given timing window, leading to a more robust and
                  efficient verification methodology.",
  keywords =     "Power Grid Analysis; Wort-case Stimuli.",
  issn =         "0302-9743",
  isbn =         "978-3-540-95947-2"
}

@InProceedings{Sebastiao-DSD08,
  author =       "Nuno Sebasti{\~a}o and Tiago Dias and Nuno Roma and Paulo
                  Flores and Leonel Sousa",
  title =        "Application Specific Programmable {IP} Core for Motion
                  Estimation: Technology Comparison Targeting Efficient
                  Embedded Co-Processing Units",
  booktitle =    DSD,
  pages =        " 181--188 ",
  location =     "Parma, Italy",
  month =        SEP # " 3--5, ",
  year =         2008,
  abstract =     "The implementation of a recently proposed IP core of an
                  efficient motion estimation co-processor is considered.
                  Some significant functional improvements to the base
                  architecture are proposed, as well as the presentation of a
                  detailed description of the interfacing between the
                  coprocessor and the main processing unit of the video
                  encoding system. Then, a performance analysis of two
                  distinct implementations of this IP core is presented,
                  considering two different target technologies: a high
                  performance FPGA device, from the Xilinx Virtex-II Pro
                  family, and an ASIC based implementation, using a 0.18um
                  CMOS StdCell library. Experimental results have shown that
                  the two alternative implementations have quite similar
                  performance levels and allow the estimation of motion
                  vectors in real-time",
  keywords =     "Motion Estimation IP core; Technology Comparison; Embedded
                  Co-Processing.",
  isbn =         "978-0-7695-3277-6",
  doi =          "10.1109/DSD.2008.66"
}

@InProceedings{Sebastiao-ACACES08,
  author =       "Nuno Sebasti{\~a}o and Tiago Dias and Nuno Roma and Paulo
                  Flores and Leonel Sousa",
  title =        "Specialized Motion Estimation Processor for Heterogeneous
                  Multicore Video Coding Systems",
  booktitle =    "International Summer School on Advanced Computer
                  Architecture and Compilation for Embedded Systems (ACACES)
                  Poster Abstracts",
  pages =        " 63--66 ",
  location =     "L'Aquila, Italy",
  month =        JUL # " 13--19, ",
  year =         2008,
  abstract =     "In the last few years, several highly efficient processor
                  architectures (e.g.: ARM, Power-PC, etc.) have been
                  proposed to implement computational demanding multimedia
                  applications. This paper proposes to use these capabilities
                  to extend the main processor computational resources, in
                  order to efficiently execute one of the most critical parts
                  of the video coding algorithm: motion estimation. This
                  extension is accomplished by embedding a specialized
                  Application Specific Instruction Set Processor (ASIP) core
                  for motion estimation that works together with a General
                  Purpose Processor in a heterogeneous multicore
                  environment.",
  keywords =     "Motion Estimation; Multicore; Video Encoding.",
  isbn =         "978-9-038-21288-3",
  annote =       "Summer School: short paper + poster"
}

@InProceedings{Aksoy-SIPS07,
  author =       "Levent Aksoy and Ece Olcay Gunes and Eduardo Costa and
                  Paulo Flores and José Monteiro",
  title =        "Effect of Number Representation on the Achievable Minimum
                  Number of Operations in Multiple Constant Multiplications",
  booktitle =    SIPS,
  pages =        " 424--429 ",
  location =     "Shangai, China",
  month =        OCT # " 17--19, ",
  year =         2007,
  abstract =     "In this work, we analyze the effect of representing
                  constants under binary, CSD, and MSD representations on the
                  minimum number of operations required in a multiple
                  constant multiplications problem. To this end, we resort to
                  a recently proposed algorithm that computes the exact
                  minimum solution. To extend the applicability of this
                  algorithm to much larger instances, we propose problem
                  reduction and model simplification techniques that
                  significantly reduce the search space. We have conducted
                  experiments on a rich set of instances including randomly
                  generated and FIR filter instances. The results show that,
                  contrary to common belief, the binary representation
                  clearly yields better solutions than CSD, and even provides
                  slightly better solutions than MSD. Moreover, the
                  superiority of the binary solutions increases as the number
                  and bit-width of the constants increase.",
  keywords =     "Canonical Signed Digit (CSD), Common Subexpression
                  Elimination (CSE), Minimal Signed Digit (MSD), Multiple
                  Constant Multiplication (MCM).",
  issn =         "1520-6130",
  isbn =         "978-1-4244-1222-8",
  doi =          "10.1109/SIPS.2007.4387585",
  annote =       "IF=, Citations(Scholar)=1, Citations(ISI)=, "
}

@InProceedings{Aksoy-ECCTD07,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and
                  Jos{\'e} Monteiro",
  title =        "Minimum Number of Operations under a General Number
                  Representation for Digital Filter Synthesis",
  booktitle =    ECCTD,
  pages =        " 252--255 ",
  location =     "Seville, Spain",
  month =        AUG # " 26--30, ",
  year =         2007,
  abstract =     "In this work, we introduce an algorithm for the
                  optimization of the number of operations in the multiplier
                  block of a digital filter based on a general number
                  representation for the coefficients. In common
                  subexpression elimination algorithms, constants are
                  generally represented with the minimum number of non-zero
                  digits based on their CSD, or MSD representations. We
                  observe that these representations may yield a solution far
                  from the minimum. The general number representation used in
                  our algorithm considers a much larger set of alternative
                  implementations of a constant, which includes the CSD and
                  MSD representations. To cope with the increased search
                  space, we propose model simplification and problem
                  reduction techniques. In this paper, we show that the
                  proposed exact algorithm using general number
                  representation achieves a significant reduction in the
                  number of operations, which can be up to 15\% with respect
                  to the solutions obtained under MSD representation.",
  isbn =         "1-4244-1342-7",
  doi =          "10.1109/ECCTD.2007.4529584",
  annote =       "IF=, Citations(Scholar)=3, Citations(ISI)=, "
}

@InProceedings{Aksoy-DAC07,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and
                  Jos{\'e} Monteiro",
  title =        "Optimization of Area in Digital {FIR} Filters using
                  Gate-Level Metrics",
  booktitle =    DAC,
  publisher =    "ACM Press",
  address =      "New York, NY, USA",
  pages =        " 420--423 ",
  location =     "San Diego, California",
  month =        JUN # " 4--8, ",
  year =         2007,
  abstract =     "In the paper, we propose a new metric for the minimization
                  of area in the generic problem of multiple constant
                  multiplications, and demonstrate its effectiveness for
                  digital FIR filters. Previous methods use the number of
                  required additions or subtractions as a cost function. We
                  make the observation that not all of these operations have
                  the same design cost. In the proposed algorithm, a minimum
                  area solution is obtained by considering area estimates for
                  each operation. To this end, we introduce accurate hardware
                  models for addition and subtraction operations in terms of
                  gate-level metrics, under both signed and unsigned
                  representations. Our algorithm not only computes the best
                  design solution among those that have the same number of
                  operations, but is also able to find better area solutions
                  using a non-minimum number of operations. The results
                  obtained by the proposed exact algorithm are compared with
                  the results of the exact algorithm designed for the minimum
                  number of operations on FIR filters and it is shown that
                  the area of the design can be reduced by up to 18\%.
                  General Terms Algorithms, design.",
  keywords =     "FIR, Multiple Constant Multiplication, Area Optimization.",
  issn =         "0738-100X",
  isbn =         "978-1-59593-627-1",
  doi =          "http://doi.acm.org/10.1145/1278480.1278588",
  annote =       "IF=, Citations(Scholar)=4, Citations(ISI)=, "
}

@InProceedings{Morgado-ISVLSI07,
  author =       "P. Marques Morgado and Paulo F. Flores and L. Miguel
                  Silveira",
  title =        "Generating Realistic Stimuli for Accurate Power Grid
                  Analysis",
  booktitle =    ISVLSI,
  pages =        " 233--238 ",
  location =     "Porto Alegre",
  month =        MAR # " 9--11, ",
  year =         2007,
  abstract =     "Power distribution systems in integrated circuits are used
                  to provide the voltages and currents the devices need to
                  operate properly. As the semiconductor industry moves into
                  deep nanometer nodes, problems like voltage drop, ground
                  bounce and electromigration which may cause chip failures,
                  are worsening, as more devices, operating at higher
                  frequencies are placed closer together. Verification of a
                  power distribution system is therefore paramount to silicon
                  success. This type of verification is usually done by
                  simulation, targeting a worst-case scenario, typically
                  characterized by the almost simultaneous switching of
                  several devices in the circuit. The definition of the
                  worst-case situation is therefore crucial, since it
                  influences the result of the simulation and ultimately the
                  design target. Supposedly safe but unrealistic settings
                  such as assuming that all signals switch simultaneously,
                  could lead to costly over-designs in terms of die area. In
                  this paper we describe a software tool that generates a
                  reasonable, realistic, worst-case set of stimuli for
                  simulation, based on timing and spatial restrictions that
                  arise from the circuit's netlist and placement. Generating
                  such stimuli is akin to performing a standard static timing
                  analysis, as is done before signoff so the tool fits well
                  within conventional design frameworks. The resulting
                  stimuli indicates that only a fraction of the gates change
                  in any given timing window, leading to a more robust
                  verification methodology, specially in the dynamic case.",
  keywords =     "Circuit CAD, Formal Verification, Integrated Circuit
                  Design, Power Distribution, Power Grids, Software Tools,
                  Timing.",
  isbn =         "0-7695-2896-1",
  doi =          "10.1109/ISVLSI.2007.47",
  annote =       "IF=, Citations(Scholar)=, Citations(ISI)=, "
}

@InProceedings{Aksoy-ICECS06,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and
                  Jos{\'e} Monteiro",
  title =        "{ASSUME}s: Heuristic Algorithms for Optimization of Area
                  and Delay in Digital Filter Synthesis",
  booktitle =    ICECS,
  pages =        " 748--751 ",
  month =        DEC # " 10--13, ",
  year =         2006,
  abstract =     "In this work two heuristic algorithms are presented for the
                  problems of optimization of area and optimization of area
                  under a delay constraint in digital filter synthesis. The
                  heuristics search for a solution on a combinational network
                  that represents a covering problem using a greedy method
                  for partial term selection. The methods start from the
                  outputs towards the inputs for each coefficient. This
                  top-down approach considers a much larger solution space
                  than existing bottom-up heuristic algorithms. We present
                  results on a wide range of instances and compare them with
                  exact and prominent heuristic algorithms. The results
                  demonstrate that the solutions obtained by the proposed
                  heuristics are extremely close to the exact solutions and
                  are significantly better than the existing heuristic
                  algorithms.",
  isbn =         "1-4244-0395-2",
  doi =          "10.1109/ICECS.2006.379897",
  annote =       "IF=, Citations(Scholar)=1, Citations(ISI)=, "
}

@InProceedings{Costa-SBCCI06,
  author =       "Eduardo Costa and Paulo Flores and Jos{\'e} Monteiro",
  title =        "Exploiting General Coefficient Representation for the
                  Optimal Sharing of Partial Products in {MCM}s",
  booktitle =    SBCCI,
  publisher =    "ACM Press",
  address =      "New York, NY, USA",
  pages =        " 161--166 ",
  location =     "Ouro Preto, MG, Brazil",
  month =        AUG # " 28--31, ",
  year =         2006,
  abstract =     "We propose a new algorithm that maximizes he sharing of
                  partial terms in Multiple Cons an Multiplication (MCM)
                  operations under a general number representation for the
                  coefficients. MCM operations are required by many
                  algorithms in digital signal processing and have been the
                  subject of extensive research. By making no assumptions as
                  to the number representation, the algorithm described in
                  his paper is able to perform a better search for the
                  optimal sharing of partial terms than previous methods
                  based on MSD or CSD representations. We have applied our
                  algorithm for the hardware minimization of FIR filers.The
                  results show that we can ob ain solutions that require
                  between 20\% to 50\% less hardware when compared agains he
                  solutions using he MSD representation.",
  keywords =     "Common Subexpression Elimination (CSE), Digital Filter
                  Design, Minimal Signed Digit (MSD), Multiple Constant
                  Multiplication (MCM).",
  isbn =         "1-59593-479-0",
  doi =          "http://doi.acm.org/10.1145/1150343.1150387",
  annote =       "IF=, Citations(Scholar)=1, Citations(ISI)=, "
}

@InProceedings{Aksoy-DAC06,
  author =       "Levent Aksoy and Eduardo Costa and Paulo Flores and
                  Jos{\'e} Monteiro",
  title =        "Optimization of Area under a Delay Constraint in Digital
                  Filter Synthesis Using {SAT}-Based Integer Linear
                  Programming",
  booktitle =    DAC,
  publisher =    "ACM Press",
  address =      "New York, NY, USA",
  pages =        " 669--674 ",
  location =     "San Francisco, CA, USA",
  month =        JUL # " 10--14, , ",
  year =         2006,
  abstract =     "In this paper, we propose an exact algorithm for the
                  problem of area optimization under a delay constraint in
                  the synthesis of multiplierless FIR filters. To the best of
                  our knowledge, the method presented in this paper is the
                  only exact algorithm designed for this problem. We present
                  the results of the algorithm on real-sized filter instances
                  and compare with an improved version of a recently proposed
                  exact algorithm designed for the minimization of area. We
                  show that in many cases delay can be minimized without any
                  area penalty. Additionally, we describe two approximate
                  algorithms that can be applied to instances which cannot be
                  solved, or take too long, with the exact algorithm. We show
                  that these algorithms find similar solutions to the exact
                  algorithm in less CPU time.",
  issn =         "0738-100X",
  isbn =         "1-59593-381-6",
  doi =          "http://doi.acm.org/10.1145/1146909.1147079",
  annote =       "IF=, Citations(Scholar)=2, Citations(ISI)=, "
}

@InProceedings{Flores-ICCAD05,
  author =       "Paulo Flores and Jos{\'e} Monteiro and Eduardo Costa",
  title =        "An Exact Algorithm for the Maximal Sharing of Partial Terms
                  in Multiple Constant Multiplication",
  booktitle =    ICCAD,
  publisher =    "IEEE Computer Society",
  address =      "Washington, DC, USA",
  pages =        " 13--16 ",
  location =     "San Jose, CA",
  month =        NOV # " 6--10, ",
  year =         2005,
  abstract =     "In this paper, we propose an exact algorithm that maximizes
                  the sharing of partial terms in multiple constant
                  multiplication (MCM) operations. We model this problem as a
                  Boolean network that covers all possible partial terms
                  which may be used to generate the set of coefficients in
                  the MCM instance. The PIs to this network are shifted
                  versions of the MCM input. An AND gate represents an adder
                  or a subtracter, i.e., an AND gate generates a new partial
                  term. All partial terms that have the same numerical value
                  are ORed together. There is a single output which is an
                  /spl and/ over all the coefficients in the MCM. We cast
                  this problem into a 0-1 integer linear programming (ILP)
                  problem by requiring that the output is asserted while
                  minimizing the total number of AND gates that evaluate to
                  one. A SAT-based solver is used to obtain the exact
                  solution. We argue that for many real problems the size of
                  the problem is within the capabilities of current SAT
                  solvers. We present results using binary, CSD and MSD
                  representations. Two main conclusions can be drawn from the
                  results. One is that, in many cases, existing heuristics
                  perform well, computing the best solution, or one close to
                  it. The other is that the flexibility of the MSD
                  representation does not have a significant impact in the
                  solution obtained.",
  isbn =         "0-7803-9254-X",
  doi =          "10.1109/ICCAD.2005.1560032",
  annote =       "IF=, Citations(Scholar)=20, Citations(ISI)=, "
}

@InProceedings{Costa-ECCTD05,
  author =       "Eduardo da Costa and Paulo Flores and Jos{\'e} Monteiro",
  title =        "Maximal Sharing of Partial Terms in {MCM} Under Minimal
                  Signed Digit Representation",
  booktitle =    ECCTD,
  volume =       "II",
  pages =        " 221--224 ",
  month =        AUG # " 28--" # SEP # " 2, ",
  year =         2005,
  abstract =     "We propose a new algorithm that maximizes the sharing of
                  partial terms in Multiple Constant Multiplication (MCM)
                  operations. MCM operations are required by many algorithms
                  in digital signal processing and have been the subject of
                  extensive research. Recently, the Minimal Signed Digit
                  (MSD) number representation has been proposed as an
                  extension to the Canonical Signed Digit (CSD)
                  representation. By properly exploiting the redundancy of
                  the MSD representation, the hardware implementation can be
                  significantly optimized. The initial algorithm described in
                  this paper is able to perform a better search for the
                  optimal sharing of the redundant coefficient
                  representations under MSD than previous methods. However,
                  during its search the depth of adder-steps is not
                  considered. We present a modified version of this algorithm
                  that is able to reduce the maximum depth of partial terms
                  at the expense of some extra hardware. The results show
                  that for more complex problems our algorithm performs
                  significantly better than previous approaches, in some
                  cases obtaining solutions that require 25\% less hardware.",
  keywords =     "Adders, Digital Arithmetic, Digital Signals, Multiplying
                  Circuits.",
  isbn =         "0-7803-9066-0",
  doi =          "10.1109/ECCTD.2005.1523033",
  annote =       "IF=, Citations(Scholar)=4, Citations(ISI)=, "
}

@InProceedings{Flores-ISCAS99,
  author =       "Paulo Flores and Hor{\'a}cio Neto and Krishnendu
                  Chakrabarty and Jo{\~a}o Marques-Silva",
  title =        "Test pattern generation for width compression in {BIST}",
  booktitle =    ISCAS,
  volume =       1,
  pages =        " 114--118 ",
  location =     "Orlando, FL, USA",
  month =        MAY # " 30 -- " # JUN # " 2, ",
  year =         1999,
  abstract =     "The main objectives of built-in self test (BIST) are the
                  design of test pattern generator circuits which achieve the
                  highest fault coverage, require the shortest sequence of
                  test vectors and utilize the minimum circuit area. This
                  paper targets the problem of generating test patterns for
                  stuck-at faults that induce compatibility relations between
                  the primary inputs of the circuit under test. These
                  compatibility relations can be used for designing
                  counter-based test generator circuits with a reduced number
                  of bits, thus requiring smaller testing time and smaller
                  area. The proposed solution is based on an integer linear
                  programming (ILP) formulation that builds on existing
                  propositional satisfiability (SAT) models for test pattern
                  generation. An ATPG tool for minimum test pattern
                  generation for width compression (MTP-C) is described,
                  which illustrates the practical applicability of our
                  approach for a wide range of benchmark circuits",
  keywords =     "VLSI, Automatic Test Pattern Generation, Built-In Self
                  Test, Fault Diagnosis, Integer Programming, Linear
                  Programming, Logic Testing; BIST, Counter-Based Test
                  Generator Circuits, Fault Coverage, Integer Linear
                  Programming, Minimum Circuit Area, Minimum Test Pattern
                  Generation, Propositional Satisfiability Models, Width
                  Compression.",
  isbn =         "0-7803-5471-0",
  doi =          "10.1109/ISCAS.1999.777818"
}

@InProceedings{Flores-GLS99,
  author =       "Paulo Flores and Hor{\'a}cio Neto and Jo{\~a}o
                  Marques-Silva",
  title =        "On Applying Set Covering Models to Test Set Compaction",
  booktitle =    GLS,
  publisher =    "IEEE Computer Society",
  pages =        " 8--11 ",
  location =     "Ypsilanti, MI",
  month =        MAR # " 4--6, ",
  year =         1999,
  abstract =     "Test set compaction is fundamental problem in digital
                  system testing. In recent years, many competitive solutions
                  have been proposed, most of which based on heuristics
                  approaches. This paper studies the application of set
                  covering models to the compaction of test sets, which can
                  be used with any heuristic test set compaction procedure.
                  For this purpose, recent and highly effective set covering
                  algorithms are used. Experimental evidence suggests that
                  the size of computed test sets can often be reduced by
                  using set covering models and algorithms. Moreover a
                  noteworthy empirical conclusion is that it may be
                  preferable not to use fault simulation when the final
                  objective is test set compaction",
  keywords =     "Automatic Test Pattern Generation, Integrated Circuit
                  Testing, Logic Testing, Minimisation.",
  doi =          "10.1109/GLSV.1999.757365"
}

@InProceedings{Flores-VLSID99,
  author =       "Paulo Flores and Jos{\'e} Costa and Hor{\'a}cio Neto and
                  Jos{\'e} Monteiro and Jo{\~a}o Marques-Silva",
  title =        "Assignment and Reordering of Incompletely Specified Pattern
                  Sequences Targetting Minimum Power Dissipation",
  booktitle =    "VLSID '99: Proceedings of the 12th International Conference
                  on VLSI Design - 'VLSI for the Information Appliance'",
  publisher =    "IEEE Computer Society",
  address =      "Washington, DC, USA",
  pages =        " 37--41 ",
  location =     "Goa, India",
  month =        JAN # " 10--13, ",
  year =         1999,
  isbn =         "0-7695-0013-7"
}

@InProceedings{Flores-ICCD98,
  author =       "Paulo F. Flores and Hor{\'a}cio C. Neto and Jo{\~a}o P.
                  Marques-Silva",
  title =        "An exact solution to the minimum size test pattern problem",
  booktitle =    ICCD,
  pages =        " 510--515 ",
  location =     "Austin, TX",
  month =        OCT # " 5--7, ",
  year =         1998,
  abstract =     "This paper addresses the problem of test pattern generation
                  for single stuck-at faults in combinational circuits, under
                  the additional constraint that the number of specified
                  primary input assignments is minimized. This problem has
                  different applications in testing including the
                  identification of don't care conditions to be used in the
                  synthesis of Built-In Self- Test (BIST) logic. The proposed
                  solution is based on an integer linear programming (ILP)
                  formulation which builds on an existing propositional
                  satisfiability (SAT) model for test pattern generation. The
                  resulting ILP formulation is linear on the size of the
                  original SAT model for test generation, which is linear on
                  the size of the circuit. Nevertheless, the resulting ILP
                  instances represent complex optimization problems, that
                  require dedicated ILP algorithms. Preliminary results on
                  benchmark circuits validate the practical applicability of
                  the test pattern minimization model and associated ILP
                  algorithm",
  keywords =     "Automatic Test Pattern Generation, Built-In Self Test,
                  Combinational Circuits, Computability, Integer Programming,
                  Linear Programming, Logic Testing.",
  doi =          "10.1109/ICCD.1998.727097"
}

@InProceedings{Flores-IWLS98-power,
  author =       "Jos{\'e} C. Costa and Paulo F. Flores and Hor{\'a}cio C.
                  Neto and Jos{\'e} C. Monteiro and Jo{\~a}o P.
                  Marques-Silva",
  title =        "Power Reduction in {BIST} by Exploiting Don't Cares in Test
                  Patterns",
  booktitle =    IWLS,
  pages =        " 375--386 ",
  month =        JUN,
  year =         1998
}

@InProceedings{Flores-IWLS98-mtp,
  author =       "Paulo F. Flores and Hor{\'a}cio C. Neto and Jo{\~a}o P.
                  Marques-Silva",
  title =        "An Exact Solution to the Minimum-Size Test Pattern Problem",
  booktitle =    IWLS,
  pages =        " 452--470 ",
  month =        JUN,
  year =         1998
}

@InProceedings{Flores-ETW98-power,
  author =       "Jos{\'e} C. Costa and Paulo F. Flores and Hor{\'a}cio C.
                  Neto and Jos{\'e} C. Monteiro and Jo{\~a}o P.
                  Marques-Silva",
  title =        "Exploiting Don't Cares in Test Patterns to Reduce Power
                  During {BIST}",
  booktitle =    ETW,
  pages =        " 34--36 ",
  month =        MAY,
  year =         1998
}

@InProceedings{Flores-ETW98-mtp,
  author =       "Paulo F. Flores and Hor{\'a}cio C. Neto and Krishnendu
                  Chakrabarty and Jo{\~a}o P. Marques-Silva",
  title =        "A Model and Algorithm for Computing Minimum-Size Test
                  Patterns",
  booktitle =    ETW,
  pages =        " 147--148 ",
  month =        MAY,
  year =         1998
}

@InProceedings{Flores-CTC98-power,
  author =       "Jos{\'e} C. Costa and Paulo F. Flores and Jos{\'e} C.
                  Monteiro and Jo{\~a}o P. Marques-Silva",
  title =        "Power Reduction in {BIST} by Exploiting Don't Cares in Test
                  Patterns",
  booktitle =    CTC,
  month =        MAY,
  year =         1998
}

@InProceedings{Flores-CTC98-mtp,
  author =       "Paulo F. Flores and Hor{\'a}cio C. Neto and Krishnendu
                  Chakrabarty and Jo{\~a}o P. Marques-Silva",
  title =        "A Model and Algorithm for Computing Minimum-Size Test
                  Patterns",
  booktitle =    CTC,
  month =        MAY,
  year =         1998
}

@InProceedings{Manquinho-ICTAI97,
  author =       "Vasco Manquinho and Paulo Flores and Jo{\~a}o Marques-Silva
                  and Arlindo L. Oliveira",
  title =        "Prime Implicant Computation Using Satisfiability
                  Algorithms",
  booktitle =    ICTAI,
  pages =        " 232--239 ",
  location =     "Newport Beach, CA",
  month =        NOV # " 3--8, ",
  year =         1997,
  abstract =     "The computation of prime implicants has several and
                  significant applications in different areas, including
                  automated reasoning, non-monotonic reasoning, electronic
                  design automation, among others. The authors describe a new
                  model and algorithm for computing minimum-size prime
                  implicants of propositional formulas. The proposed approach
                  is based on creating an integer linear program (ILP)
                  formulation for computing the minimum-size prime implicant,
                  which simplifies existing formulations. In addition, they
                  introduce two new algorithms for solving ILPs, both of
                  which are built on top of an algorithm for propositional
                  satisfiability (SAT). Given the organization of the
                  proposed SAT algorithm, the resulting ILP procedures
                  implement powerful search pruning techniques, including a
                  non-chronological backtracking search strategy, clause
                  recording procedures and identification of necessary
                  assignments. Experimental results, obtained on several
                  benchmark examples, indicate that the proposed model and
                  algorithms are significantly more efficient than other
                  existing solutions",
  keywords =     "Backtracking, Computability, Integer Programming, Linear
                  Programming, Nonmonotonic Reasoning, Search Problems.",
  doi =          "10.1109/TAI.1997.632261"
}

@InProceedings{Flores-DCIS96,
  author =       "Paulo Flores and Hor{\'a}cio Neto",
  title =        "Register Transfer Level {VHDL} Block Generation",
  booktitle =    DCIS,
  month =        NOV,
  year =         1996
}