@Comment{Removed GitHub web-scrape residue (page chrome and gutter line numbers) that preceded the BibTeX content.}
% Encoding: UTF-8
@WWW{verry2012apple,
  author    = {Verry, Tim},
editor = {PC Perspective},
title = {Apple’s A6 processor uses hand drawn ARM cores to boost performance},
date = {2012-09-27},
url = {https://pcper.com/2012/09/apples-a6-processor-uses-hand-drawn-arm-cores-to-boost-performance/},
keywords = {ARM, A15, Apple, A6, iPhone 5,hand drawn},
review = {- Texas Instruments is giving up on ARM chips for tablets and smartphones
- hand drawn designs are becoming increasingly rare
- AMD has given up hand drawn design with Steamroller
- Apple and the engineers acquired from its purchase of PA Semi have manually drawn out the processor by hand},
timestamp = {2019-06-08},
}
@WWW{wikipedia2013pasemi,
  editor    = {{Wikipedia}},
title = {P.A. Semi},
year = {2013},
url = {https://en.wikipedia.org/wiki/P.A._Semi},
keywords = {fabless, semiconductor, acquisition, Apple},
review = {- On 23 April 2008, Apple announced that they had acquired P.A. Semi.
- On 11 June 2008, during the annual Worldwide Developer’s Conference, Apple CEO Steve Jobs said that the acquisition was meant to add the talent of P.A. Semi’s engineers to Apple’s workforce and help them build custom chips for the iPod, iPhone and other future mobile devices such as the iPad.},
timestamp = {2019-06-08},
}
@Electronic{weber2006maskcost,
  author    = {Weber, Charles and Berglund, C. Neil and Gabella, Patricia},
  editor    = {{Portland State University}},
title = {Mask cost and profitability in photomask manufacturing, an empirical analysis},
date = {2006-11},
url = {http://web.pdx.edu/~webercm/documents/2006%20November%20IEEE%20TSM%20Weber%20Berglund%20Gabella.pdf},
abstract = {An empirical study of the economics of manufacturing photomasks concludes that the
uncontrolled growth of optical proximity effect correction and resolution enhancement techniques is
driving up the cost of pattern generation and mask inspection to levels that threaten the profitability of
photomask manufacturing. The intrinsic cost of some leading edge photomasks has already exceeded
the price that customers are willing to pay for them. A model of the lifecycle of photomask manufacturing,
developed from interviews involving the 1990 to 2005 operations of six mask shops and a survey of
seven photomask manufacturers, shows that design for manufacturability (DFM) constitutes the most
promising approach for alleviating this market impasse. Unilateral action by mask shops to increase their
capital productivity is necessary but insufficient and perhaps unaffordable. DFM solutions will require the
majority of participants in the lithography value chain to collaborate according to a volatile demand
schedule that is driven by semiconductor manufacturers.},
keywords = {masks, costs, profitability, photomask, manufacturing},
review = {- In 2006, the cost of a mask set is ~US$ 500k for the 130 nm technology
- 65 nm node is expected to be ~US$ 2 million
- 90 nm node is expected to be ~US$ 1 million},
timestamp = {2019-06-08},
}
@Electronic{kahng1999subwavelength,
  author    = {Kahng, A. B. and Pati, Y. C.},
  editor    = {{UCLA Department of Computer Science} and {Numerical Technologies, Inc}},
title = {Subwavelength optical lithography, challenges and impact on physical design},
date = {1999-04-15},
url = {https://vlsicad.ucsd.edu/Publications/Conferences/94/c94.pdf},
abstract = {We review the implications of subwavelength optical lithography for new tools and ows in the interface between layout design and manufacturability. After discussing the necessity of corrections for optical process effects (i.e., use of optical proximity correction (OPC) and phase-shifting masks (PSM)), we focus on the implications of OPC and PSM for layout and verication methodologies. Our discussion addresses the necessary changes in the design-to- manufacturing ow, including infrastructure development in the mask and process communities as well as opportunities for research and development in physical layout and verification.},
timestamp = {2019-06-08},
}
@Electronic{unknown2007schematic6502,
  author    = {{Unknown}},
  editor    = {{Unknown}},
title = {6502 Schematic},
date = {2007-11-14},
url = {https://downloads.reactivemicro.com/Electronics/CPU/6502%20Schematic.pdf},
keywords = {6502, schematic},
review = {- it shows how instruction are decoded},
timestamp = {2019-06-09},
}
@WWW{cox2011mos6502,
  author    = {Cox, Russ},
title = {The MOS 6502 and the Best Layout Guy in the World},
date = {2011-01-03},
url = {https://research.swtch.com/6502},
abstract = {What are the key designs of the 6502 compared to other processors of its time},
}
@Electronic{chm2007motorola68000,
  editor    = {{Computer History Museum}},
title = {Oral history panel on the development and promotion of the Motorola 68000},
date = {2007-07-23},
url = {https://archive.computerhistory.org/resources/access/text/2012/04/102658164-05-01-acc.pdf},
keywords = {68000, motorola},
review = {- [For the 68000] All of that was done by hand. We didn't even need graphic programs to do it. At that time you drew it out in detail, on the mylar, and then you digitized it.
- we used CALMA for digitizing.
- Paper schematics, hand–drawing layout
- [how we really did logic simulation] Breadboards
- That was the early 1990s, and our 68060 was the first device that went into three of the gigantic Quick Time FPGA boxes, strapped together with big cables.},
timestamp = {2019-06-09},
}
@WWW{mcmillan2015pcbdesign,
  author    = {McMillan, John},
  editor    = {{Mentor, a Siemens Business}},
title = {PCB design then and now},
date = {2015-07-14},
url = {https://blogs.mentor.com/jimmartens/blog/2015/07/14/pcb-design-then-and-now/},
keywords = {pcb, cad, calma, digitizing},
review = {- talks about Calma Systems, a CAD system in then 70-80's
- a separate computer room complete with a raised floor hiding all the AC ducts and cabling housed all the data storage systems that served and backed-up the workstations.
- there was also a huge HP plotter used to print out each layer of digitized routes that would eventually be scaled and photographed on to clear film in the lab located across the hall.
- the industry has come a long, long way from hand-taping designs and digitizing hand-drawn layers as done with the Calma systems that we literally usedto build rooms around.},
timestamp = {2019-06-09},
}
@WWW{weisberg2008calma,
  author    = {Weisberg, David E.},
title = {The Engineering Design Revolution},
year = {2008},
url = {http://www.cadhistory.net/11%20CALMA.pdf},
keywords = {cad, engineering, calma},
timestamp = {2019-06-09},
}
@WWW{waalsdorp2019computers,
  editor    = {{Museum Waalsdorp}},
title = {Computers for electronic and mechanical engineering},
year = {2019},
url = {https://www.museumwaalsdorp.nl/en/history/comphistory/computer-history-the-period-1986-1989/comp866e/},
keywords = {calma, cad, engineering, pcb},
timestamp = {2019-06-09},
}
@WWW{ucamco2016calibr8tor,
  editor    = {{Ucamco, former Barco ETS}},
title = {Cilbr8tor Series},
year = {2016},
url = {https://www.ucamco.com/en/hardware/photoplotters/calibr8tor/calibr8tor-series},
keywords = {photoplotter},
review = {- minimum line width: 5 µm
- max precision : 50800 ppi},
timestamp = {2019-06-09},
}
@Book{waite1979graphics,
  author    = {Waite, Mitchell},
title = {Computer Graphics Primer},
year = {1979},
  editor    = {{Howard W. Sams \& Co., Inc.}},
subtitle = {At the cutting edge},
url = {https://www.atariarchives.org/cgp/Ch02_Sec25.php},
abstract = {Perhaps no single technology has had more impact on people than television. Yet according to the experts the real impact is just starting.
The reason? Home computers that connect to a standard television and convert it into a machine with more raw power than any product ever offered to the consumer and with the capability to completely alter the way we relate to the visual world of electronics.
This book is about one of the most exciting uses of the new home computer products—computer graphics—the ability to create complex drawings, plans, maps, and schematics on the screen of an ordinary black-and-white or color television. It is divided into three chapters. Chapter 1, “Perspectives,” presents what the entirely new field of home computer graphics is all about, explains how it got started, and illustrates some of the exciting applications for low-cost graphics displays. Chapter 2, “Basic Concepts,” introduces the general hardware and software concepts behind computer graphics and continues by presenting a profile of the numerous products on the market today. A section on graphics accessories is also included.
Chapter 3, the meat of the book, is entitled “Graphics Programming.” It introduces the graphics features of the Apple II computer used for this book, and then goes on to describe these concepts: plotting simple equations; drawing lines and vectors; creation of simple geometric shapes (rectangles, triangles, polygons, circles) as well as gaming figures (small tanks, jets, cars, rackets, animals); mandalas and other computer art effects, including tunneling; shape shifting, random artwork; detailed drawings and the use of digitizing tables; and, finally, moving figure animation.
The first two chapters of the book can be read any time and will be of help in evaluating which personal computer to buy for graphics work. The third chapter can be studied whether or not you own a computer, but your understanding will certainly be enhanced if one is available to practice the examples on.
The author hopes that you find this journey into computer graphics exciting, comprehensive, and, most of all, enjoyable.},
keywords = {calma, engineering},
review = {- The price tag on the GDS-II is a healthy $250,000.
- So far Calma has sold over 20 GDS-II systems and has many more on order.},
timestamp = {2019-06-09},
}
@Article{sugarman1975microprocessor,
  author    = {Sugarman, Robert},
  title     = {Does the country need a good \$20 microprocessor?},
journal = {The Engineering Newspaper for the Electronics Industry},
date = {1975-08-25},
url = {https://www.commodore.ca/gallery/magazines/misc/mos_605x_team_eetimes_august_1975.pdf},
abstract = {MOS Technology MCS650X microprocessor designers gather around a 200X print of the CPU Rubylith, color-coded for debugging into metallization, polysilicon and diffusion layers.
In the background is a 1000X expansion of the internal 21X143 decode-ROM, which manager Chuck Peddle claims is a key factor in obtaining small chip size.},
keywords = {6502, rubylith, engineering, chuck peddle},
review = {- 6502 was hand-drawn
- the mask used hand-cut Rubylith.},
timestamp = {2019-06-09},
}
@WWW{franz2015fpgahistory,
  author    = {Franz, Kaitlyn},
  editor    = {{Digilent Inc}},
title = {History of the FPGA},
date = {2015-01-16},
url = {https://blog.digilentinc.com/history-of-the-fpga/},
keywords = {fpga, timeline, technology},
review = {- 1960 first MOSFET
- 1961 first communication IC
- 1962 first TTL
- 1963 first CMOS
- 1965 Moore’s law
- 1970 PROM
- 1971 EPROM
- 1972 DST
- 1975 PLA (Programmable Logic Array)
- 1978 PAL (Programmable Array Logic)
- 1983 EEPROM
- 1983 GAL (Generic Array Logic)
- 1984 FLASH (~EEPROM)
- 1985 first FPGA},
timestamp = {2019-06-10},
}
@WWW{hardwarebee2018fpga,
  editor    = {{Hardwarebee}},
title = {Field Programmable Gate Array (FPGA) History and Applications},
date = {2018-02-23},
url = {http://hardwarebee.com/field-programmable-gate-array-fpga-history-applications/},
keywords = {altera, fpga, history},
review = {- The first reprogrammable logic device was created in 1984 by a company called Altera.
- It was the EP300 and offered a window that let an ultra-violet light onto EPROM cells, so they could be erased},
timestamp = {2019-06-10},
}
@WWW{nenni2012fpgahistory,
  author    = {Nenni, Daniel},
title = {A Brief History of FPGAs},
date = {2012-08-26},
url = {https://semiwiki.com/fpga/1596-a-brief-history-of-fpgas/},
keywords = {xilinx, fpga, history},
review = {- In the 80's semiconductors cost millions of dollars to design and manufacture
- FPGAs also dramatically reduced time to market for electronic products.
- Ross Freeman worked at Zilog before
- Ross Freeman created Xilinx in 1984
- Seiko started manufacturing the first FPGAs for Xilinx in 1985 using a very mature 1.2 micron process.
- the first Xilinx FPGA was a 1000 ASIC gate equivalent running at 18MHZ.},
timestamp = {2019-06-10},
}
@Electronic{chm2009ep300,
  editor    = {{Computer History Museum}},
title = {Altera EP300 Design \& Development Oral History Panel},
date = {2009-10-20},
url = {https://archive.computerhistory.org/resources/access/text/2012/10/102702147-05-01-acc.pdf},
keywords = {altera, ep300, interview, Source III},
timestamp = {2019-06-10},
}
@Electronic{gould2013peel18cv8,
  editor    = {{Gould Electronics}},
title = {Electrically Erasable Programmable Logic PEEL 18CV8},
date = {2013-07-14},
url = {https://www.datasheetarchive.com/pdf/download.php?id=3ae6b7f4f1c26b281f249beac3c15d411ba916&type=O},
}
@WWW{culver2011cpumaking,
  author    = {Culver, John},
  editor    = {{The CPU Shack}},
title = {How a CPU Microprocessor is made},
date = {2011-04-20},
url = {http://www.cpushack.com/MakingWafers.html},
keywords = {wafer, silicon, fabrication},
review = {- wafer is mainly made of silicon with electrically active elements such as arsenic, boron, phosphorous or antimony.},
timestamp = {2019-06-11},
}
@WWW{mcgrath2009fpgastartups,
  author    = {McGrath, Dylan},
  editor    = {{Electronic Engineering Times}},
title = {FPGA startups stare down giants and ghosts},
date = {2009-07-27},
url = {https://www.eetimes.com/document.asp?doc_id=1263547},
keywords = {timeline, fpga, vendors, xilinx, altera},
review = {- Timeline of programmable logic vendors
- Because programmable logic vendors have traditionally provided software design tools to users at very low or no cost, the price of entry in this market includes not just silicon R\&D, but software R\&D as well.
- Market watcher Gartner Inc. estimates that Xilinx Inc. and Altera Corp. together accounted for nearly 87 percent of the programmable logic market in 2008
- Analysts like Lewis and Rich Wawrzyniak of Semico Research Corp. attribute the rise in programmable logic startup activity at least partially to the expiration of several key patents once held by the established players.
- Patents considered critical to the birth of FPGAs began expiring a few years ago. They include the original FPGA patent (U.S. Patent No. 4,870,302), issued to Xilinx co-founder Ross Freeman in 1988.
- Xilinx has more software engineers than hardware engineers; at Altera, the mix is roughly 50-50.},
timestamp = {2019-06-11},
}
@Patent{freeman1989clbpatent,
  author    = {Freeman, Ross H.},
title = {Configurable electrical circuit having configurable logic elements and configurable interconnects},
number = {US4870302A},
date = {1989-09-26},
holder = {Xilinx Inc},
type = {patentus},
url = {https://patents.google.com/patent/US4870302A/},
abstract = {A configurable logic array comprises a plurality of configurable logic elements variably interconnected in response to control signals to perform a selected logic function. Each configurable logic element in the array is in itself capable of performing any one of a plurality of logic functions depending upon the control information placed in the configurable logic element. Each configurable logic element can have its function varied even after it is installed in a system by changing the control information placed in that element. Structure is provided for storing control information and providing access to the stored control information to allow each configurable logic element to be properly configured prior to the initiation of operation of the system of which the array is a part. Novel interconnection structures are provided to facilitate the configuring of each logic element.},
keywords = {fpga, xilinx},
review = {- 1988-02-19, application filed by Xilinx Inc
- 2006-09-26, anticipated expiration},
timestamp = {2019-06-11},
}
@WWW{kallaher2016palcpldfpga,
  author    = {Kallaher, Brandon},
  editor    = {{Digilent Blog}},
title = {PAL vs. CPLD vs. FPGA},
date = {2016-08-10},
url = {https://blog.digilentinc.com/pal-vs-cpld-vs-fpga/},
keywords = {pal, cpld, fpga, comparison, use case},
review = {- PALs are made using two building blocks: a logic plane and output logic cells.
- PALs generally have around 20 I/O pins
- The main advantage of a CPLD (Complex Programmable Logic Device) over a PAL is the larger number of available gates and I/O pins.
- A typical use case for a CPLD is to configure an FPGA upon boot.
- Field Programmable Gate Arrays (FPGAs) are completely reconfigurable devices that have gate counts in the millions and hundreds of I/O pins.
- [FPGA] allow for highly complex designs, such as processors, to be created and tested.},
timestamp = {2019-06-11},
}
@WWW{carton2006portes,
  author    = {Carton, Olivier},
  editor    = {{Institut de Recherche en Informatique Fondamentale}},
title = {Transistors et portes logiques},
date = {2006-09-12},
url = {https://www.irif.fr/~carton/Enseignement/Architecture/Cours/Gates/},
keywords = {porte, gate, transistor, not, nand, nor, xor, cmos},
review = {- La porte not peut être réalisée en logique CMOS par un circuit constitué de 2 transistors
- [La porte nand peut être réalisée en logique CMOS par un circuit] constitué de 4 transistors dont 2 n-MOS et 2 p-MOS.
- [La porte nor peut être réalisée en logique CMOS par un circuit] constitué de 4 transistors dont 2 n-MOS et 2 p-MOS.
- La porte and peut être réalisée en logique CMOS par un circuit constitué de 6 transistors dont 3 n-MOS et 3 p-MOS.
- La porte xor peut être réalisée en logique CMOS par un circuit constitué de 14 transistors dont 7 n-MOS et 7 p-MOS.},
timestamp = {2019-06-11},
}
@Electronic{altera2019cyclonev,
  editor    = {{Altera, Intel}},
title = {Cyclone V Device Handbook: Volume 1: Device Interfaces and Integration, Logic Array Blocks and Adaptive Logic Modules in Cyclone V Devices},
date = {2019-05-15},
url = {https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/hb/cyclone-v/cv_5v2.pdf},
keywords = {altera, intel, cyclone v},
timestamp = {2019-06-11},
}
@WWW{intel2015alteraacquisition,
  editor    = {{Intel}},
title = {Intel completes acquisition of Altera},
date = {2015-12-28},
url = {https://newsroom.intel.com/news-releases/intel-completes-acquisition-of-altera/},
abstract = {Intel Corporation (“Intel”) today announced that it has completed the acquisition of Altera Corporation (“Altera”), a leading provider of field-programmable gate array (FPGA) technology. The acquisition complements Intel’s leading-edge product portfolio and enables new classes of products in the high-growth data center and Internet of Things (IoT) market segments.},
}
@WWW{higginbotham2015intelaltera,
  author    = {Higginbotham, Stacey},
  editor    = {{Fortune}},
title = {Why Intel will spend $16.7 billion on Altera},
date = {2015-08-27},
url = {http://fortune.com/2015/08/27/why-intel-altera/},
keywords = {acquisition, intel, altera},
review = {- Three months ago [may 2015] Intel said it would buy chip maker Altera in a deal valued at $16.7 billion
- Jason Waxman, the VP & GM of the cloud platforms group at Intel, […] said that by 2020 Intel believes a third of the data center market could be using the type of chips that Altera specializes in.},
timestamp = {2019-06-11},
}
@WWW{bitfusion2019elastic,
  editor    = {{BitFusion}},
title = {BitFusion, the elastic AI infrastructure for multi-cloud},
date = {2019-06-11},
url = {https://bitfusion.io/},
keywords = {gpu, fpga, asic, ai, performance},
review = {- Virtual remote attached GPUs, FPGAs and ASICs for any AI application
- 2x performance boost and 2-4x cost reduction for your AI training and inference deployment},
timestamp = {2019-06-11},
}
@WWW{higginbotham2014microsoftfpga,
  author    = {Higginbotham, Stacey},
  editor    = {{GigaOM}},
title = {Why Microsoft is building programmable chips that specialize in search},
date = {2014-06-16},
url = {https://gigaom.com/2014/06/16/why-microsoft-is-building-programmable-chips-that-specialize-in-search/},
keywords = {microsoft, fpga, cpu, comparison},
review = {- According to Doug Burger, the Microsoft Research employee quoted in the Wired story, the the FPGAs are 40 times faster than a generic Xeon CPU when it comes to running Microsoft’s algorithms.
- That explains why Microsoft, and other webscale giants from Amazon to Google are investigating different chip architectures for their servers.
- And Microsoft’s decision to test FPGAs is doubly interesting because they can actually be re-programmed when the company’s algorithms change, making them a costly, but flexible option. And if there’s one thing we know about the cloud, it’s that flexibility trumps cost.},
timestamp = {2019-06-11},
}
@WWW{harris2015microsoftfpga,
  author    = {Harris, Derrick},
  editor    = {{GigaOM}},
title = {Microsoft is building fast, low-power neural networks with FPGAs},
date = {2015-02-23},
url = {https://gigaom.com/2015/02/23/microsoft-is-building-fast-low-power-neural-networks-with-fpgas/},
keywords = {microsoft, ai, fpga, neural network},
timestamp = {2019-06-11},
}
@Electronic{ovtcharov2015cnn,
  author    = {Ovtcharov, Kalin and Ruwase, Olatunji and Kim, Joo-Young and Fowers, Jeremy and Strauss, Karin and Chung, Eric S.},
  editor    = {{Microsoft Research}},
title = {Accelerating deep convolutional neural networks using specialized hardware},
date = {2015-02-22},
url = {https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/CNN20Whitepaper.pdf},
abstract = {Recent breakthroughsin the development of multi-layer convolutional neural networkshave led to state-of-the-art improvements in the accuracy of non-trivial recognition tasks such as large-category image classificationand automaticspeech recognition.These many-layered neural networks are large, complex,and require substantial computingresourcesto train and evaluate. Unfortunately, these demands come at an inopportunemoment due to the recent slowing of gains in commodity processor performance.Hardware specializationin the form of GPGPUs, FPGAs, and ASICs offers a promising path towards major leaps in processing capabilitywhile achieving high energy efficiency. To harness specialization, an effort is underwayat Microsoft to accelerate Deep Convolutional Neural Networks (CNN) usingservers augmented with FPGAs—similar to the hardware that is being integrated intosome of Microsoft’s datacenters. Initial efforts to implement a single-node CNN accelerator on a mid-rangeFPGA showsignificant promise, resulting in respectableperformance relative to prior FPGA designs and high-end GPGPUs, at a fraction of the power. In the future, combining multiple FPGAs over a low-latency communication fabric offers further opportunity to train and evaluate models of unprecedented size and quality},
keywords = {microsoft, neural network, ai, fpga},
review = {- 233 images/sec for 25 W for FPGA Arria 10 [9.32 images/sec/watt] vs 824 images/sec for 235 W for Tesla K40 [3.5 images/sec/watt] on ImageNet 1K},
timestamp = {2019-06-11},
}
@Electronic{moore1965cramming,
  author    = {Moore, Gordon Earle},
  editor    = {{Fairchild Semiconductor}},
title = {Cramming more components onto integrated circuits},
date = {1965-04-19},
url = {https://newsroom.intel.com/wp-content/uploads/sites/11/2018/05/moores-law-electronics.pdf},
abstract = {With unit cost falling as the number of components percircuit rises, by 1975 economics may dictate squeezing asmany as 65,000 components on a single silicon chip.},
keywords = {moore's law, integrated circuit, fairchild semiconductor},
review = {- En 1965, Moore extrapolait que le nombre de composants d’un circuit intégré doublerait chaque année pendant au moins 10 ans pour atteindre 65000 en 1975
- Son extrapolation ne parlait pas des décennies suivantes
- Computers will be more powerful, and will be organized in completely different ways.},
timestamp = {2019-06-11},
}
@Electronic{moore1975progress,
  author    = {Moore, Gordon Earle},
  editor    = {{Intel}},
title = {Progress in digital integrated electronics},
year = {1975},
url = {https://www.eng.auburn.edu/~agrawvd/COURSE/E7770_Spr07/READ/Gordon_Moore_1975_Speech.pdf},
review = {- The new slope might approximate a doubling every two years, rather than every year, by the end of the decade.},
}
@Electronic{unit2015mythesfondateurs,
  editor    = {{UNIT (Université Numérique Ingénierie et Technologie)}},
title = {"Les grands mythes fondateurs" des nanos : la loi de Moore ou l'héritage du talk de Feynman de 1959},
date = {2015-06-21},
url = {http://www.unit.eu/cours/enjeux-nanosciences-nanotechnologies/Module3-FR.pdf},
abstract = {Précisions sur les prévisions de Gordon Earle Moore et la fameuse loi de Moore.},
keywords = {myth, moore's law, richard feynman, nanotechnology},
review = {Précisions sur les prévisions de Gordon Earle Moore et la fameuse loi de Moore.},
timestamp = {2019-06-11},
}
@WWW{reese2018comparinghardware,
  author    = {Reese, Lynnette},
  editor    = {{Embedded Intel Solutions}},
title = {Comparing hardware for artificial intelligence: FPGAs vs. GPUs vs. ASICs},
date = {2018-07-24},
url = {https://eecatalog.com/intel/2018/07/24/comparing-hardware-for-artificial-intelligence-fpgas-vs-gpus-vs-asics/},
keywords = {fpga, asic, gpu, cpu, comparison},
review = {- FPGAs [and ASICs] offer lower latency than GPUs or CPUs which is better for applications that require real-time AI.
- Another area where FPGAs outperform GPUs (and CPUs) is for those applications with a constrained power envelope.
- FPGAs are similar to ASICs except that FPGAs are notoriously difficult to program and ASICs have a typical production cycle time of 12 – 18 months
- Both GPUs and FPGAs can process in parallel on a massive scale. However, FPGAs also surpass GPUs for efficiency in parallel processing
- Although historically complex to program, FPGAs are carving out their own space in AI technology, with new tools that make programming AI applications that much easier.},
timestamp = {2019-06-11},
}
@Electronic{arrow2018fpgacomparison,
  editor    = {{Arrow}},
  title     = {FPGA vs CPU vs GPU vs Microcontroller},
  date      = {2018-10-02},
  url       = {https://static4.arrow.com/-/media/images/research-and-events/articles/1018/arrow_fpgavscpuvsgpuvsmicrocontroller.pdf},
abstract = {Tableaux comparatifs des points forts des différentes technologies},
keywords = {fpga, cpu, gpu, comparison, asic},
timestamp = {2019-06-11},
}
@WWW{feugey2011opencl,
  author    = {Feugey, David},
title = {Altera mise sur l’OpenCL pour révolutionner le monde des FPGA},
date = {2011-11-16},
url = {https://www.silicon.fr/altera-mise-sur-l%E2%80%99opencl-pour-revolutionner-le-monde-des-fpga-65255.html},
keywords = {opencl, cuda, gpu, fpga, altera},
review = {- il y a cinq ans [en 2006], NVIDIA lançait CUDA, une technologie permettant d’exploiter simplement (relativement) la puissance des GPU intégrés aux cartes graphiques dans le cadre de calculs massivement parallèles
- Le support de l’OpenCL au sein des FPGA d’Altera n’en est qu’à ses prémices},
timestamp = {2019-06-11},
}
@WWW{dubuc2018fpgaperspectives,
  author    = {Dubuc, Damien},
  editor    = {{Aneo}},
title = {Afin de terminer notre série de billets, voici quelques réflexions et perspectives que nous ressortons de l’étude},
date = {2018-02-06},
url = {https://www.aneo.eu/2018/02/06/perspectives-quant-a-lutilisation-fpga-chez-aneo-billet-8/},
review = {- Sur FPGA, il est coûteux d’itérer sur un nouveau design, et le grand nombre d’optimisations possibles (différant par leur nature et paramètres) pose une difficulté.},
timestamp = {2019-06-11},
}
@Electronic{altera2013opencl,
  editor    = {{Altera}},
title = {Implementing FPGA design with the OpenCL standard},
date = {2013-11-14},
url = {https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/wp/wp-01173-opencl.pdf},
abstract = {The initial era of programmable technologies contained two different extremes of programmability. As illustrated in Figure 1, one extreme was represented by single core CPU and digital signal processing (DSP) units. These devices were programmable using software consisting of a list of instructions to be executed. These instructions were created in a manner that was conceptually sequential to the programmer, although an advanced processor could reorder instructions to extract instruction-level parallelism from these sequential programs at run time. In contrast, the other extreme of programmable technology was represented by the FPGA. These devices are programmed by creating configurable hardware circuits, which execute completely in parallel. A designer using an FPGA is essentially creating a massively-fine-grained parallel application. For many years, these extremes coexisted with each type of programmability being applied to different application domains. However, recent trends in technology scaling have favored technologies that are both programmable and parallel.},
keywords = {opencl, altera, fpga},
timestamp = {2019-06-11},
}
@Electronic{denisenko2016opencl,
author = {Dmitry Denisenko},
editor = {Intel},
title = {OpenCL for FPGAs},
date = {2016-06-18},
url = {https://cpufpga.files.wordpress.com/2016/04/opencl_for_fpgas_isca_2016.pdf},
keywords = {opencl, fpga, altera, intel},
review = {- How OpenCL concepts map to FPGA architecture
- description of LUTs
- What is OpenCL?},
timestamp = {2019-06-11},
}
@Electronic{castells2018workshop,
author = {David Castells-Rufas},
editor = {Cephis},
title = {Workshop: programming FPGAs with OpenCL},
date = {2018-05-05},
url = {http://www.sie.es/wp-content/uploads/2018/06/FPGA-with-OpenCL.pdf},
keywords = {opencl, fpga, energy efficiency},
review = {- History of energy efficiency
- Dennard Scaling Rules
- Why FPGAs can be energy efficient?
- What FPGAs can provide? Remove intermediate memory from computation datapaths, allow much higher number simultaneous computation units, fine grain (bit level) computation
- Problems: lower frequency due to overheads, difficult to program (HDL)},
timestamp = {2019-06-11},
}
@WWW{hindriksen2014opencl,
author = {Vincent Hindriksen},
editor = {StreamHPC},
title = {Why use OpenCL on FPGAs?},
date = {2014-09-16},
url = {https://streamhpc.com/blog/2014-09-16/use-opencl-fpgas/},
abstract = {Altera has just released the free ebook FPGAs for dummies. One part of the book is devoted to OpenCL, so we’ll quote some extracts here from one of the chapters. The rest of the book is worth a read, so if you want to check the rest of the text, just fill in the form on Altera’s webpage},
keywords = {altera, fpgas for dummies, opencl},
review = {- Today, OpenCL is developed and maintained by the technology consortium Khronos Group. Most FPGA manufacturers provide Software Development Kits (SDKs) for OpenCL development on FPGAs.
- Free ebook FPGAs for dummies},
timestamp = {2019-06-11},
}
@Book{moore2017fpgas,
author = {Andrew Moore and Ron Wilson},
title = {FPGAs for Dummies},
date = {2017-01-09},
editor = {Intel},
isbn = {978-1-119-39049-7},
url = {https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/misc/fpgas_for_dummies_ebook.pdf},
abstract = {Field programmable gate arrays (FPGAs) are integrated cir-cuits that enable designers to program customized digital logic in the field. FPGAs have been around since the 1980s and were originally conceived to give all design teams the ability to create custom logic. In the early days, using an FPGA in your design meant you had to do a lot of programming just to get your FPGA to perform simple functions, so most design-ers avoided them. If you haven’t looked into FPGAs since your university studies way back when, you’ll want to take another look at them.The FPGA has evolved from a useful but humble interface device into a system-level integrated circuit (IC) with its own microprocessors, memory blocks, and interfaces. It’s the next big thing.Now would be a great time to get an inexpensive development kit, download free tools, and begin to explore this world for yourself. And this book will help you understand the practical uses of FPGAs.},
keywords = {fpga, intel, altera, dummies},
review = {- How FPGAs work
- The difference between FPGAs, ASSPs, and ASICs
- To use FPGAs as functional blocks in a system
- [FPGAs] enable you to build exactly the hardware you need
- [FPGAs can be customized meaning] that often […] you can do operations in a simpler, faster, more energy-efficient way.
- An FPGA is a semiconductor device on which the function can be defined after manufacturing
- "field programmable = programmable in the field", programmable sur le terrain, reconfigurable alors que le circuit intégré est installé sur un circuit imprimé
- constraints : cost, performance, power consumption, size, interface…
- OpenCL was first developped by Apple. Today, OpenCL is developed and maintained by the technology consortium Khronos Group.
- FPGAs are inherently parallel
- Single-device motor control: microcontrollers can fall short of the performance demands of sophisticated motor-control algorithms such as direct torque control (DTC) or sensorless field oriented control (SFOC)
- Television broadcasting: television broadcasters use a serial digital interface (SDI) standard to transmit uncompressed digital video. The latest standard is called the 3-Gbps SDI and is capable of moving 4K ultraHD signals. FPGA solutions come with a core transceiver that can function on all three SDI rates (SD SDI, HD SDI and 3G-SDI) on the same transceiver.
- Wireless data: many FPGAs now come equipped with built-in low-latency intellectual property (IP) for advanced networks as well as productivity enhancing tools to allow manufacturers to leverage FPGAs advantages of performance, power, price and productivity to focus their efforts on product differentiation.
- Automotive driver assistance cameras : traditional DSP processors or microcontrollers don't have the power to do real-time video processing and analytics at the same time. Moreover, HDR or high dynamic range, is a necessity for video analytics to be accurate. HDR processing can as much as triple the demand for video signal processing power, taking the performance requirements out of reach for all but the most expensive DSPs.
- High-performance computing: developing application-specific coprocessors. Latest Intel FPGAs build in not just DSP functions but floating-point hardware.},
timestamp = {2019-06-11},
}
@Electronic{wolf2008mpsoc,
author = {Wayne Wolf and Ahmed Amine Jerraya and Grant Martin},
editor = {IEEE},
title = {Multiprocessor System-on-Chip (MPSoC) Technology},
date = {2008-10-10},
url = {http://www.cs.unc.edu/~montek/teaching/Comp790-Fall11/Home/Home_files/2008Wolf.pdf},
abstract = {The multiprocessor system-on-chip (MPSoC) uses multiple CPUs along with other hardware subsystems to implement a system. A wide range of MPSoC architectures have been developed over the past decade. This paper surveys the history of MPSoCs to argue that they represent an important and distinct category of computer architecture. We consider some of the technological trends that have driven the design of MPSoCs. We also survey computer-aided design problems relevant to the design of MPSoCs},
keywords = {Configurable processors, encoding, hardware/software codesign, multiprocessor, multiprocessor system-on-chip(MPSoC)},
review = {- [MPSoCs] are not simply traditional multiprocessors shrunk to a single chip but have been designed to fulfill the unique requirements of embedded applications
- MPSoCs have been in production for much longer than multicore processors.
- multicore <> MPSoCs in that they simply combine many same processors on a chip while MPSoCs combine heterogeneous processors with different requirements},
timestamp = {2019-06-12},
}
@Electronic{baruch2002configurable,
author = {Zoltan Baruch and Octavian Creţ and Kalman Pusztai},
editor = {Technical University of Cluj-Napoca},
title = {Configurable processor},
date = {2002-05-25},
url = {https://www.researchgate.net/publication/229001961_CONFIGURABLE_PROCESSOR},
abstract = {Configurable architectures can deliver the high performance required by
computationally-demanding applications, similar to the ASIC circuits, while providing the
flexibility of the programmable processors. The performances achieved by these architectures
are often one or two orders of magnitude higher than those of processor-based alternatives. In
this paper we describe the design and implementation of a configurable processor. The proces-
sor consists of a constant part and a configurable structure. The constant part allows to solve
simple applications without changing the existing resources. The configurable part is defined by
the user, based on the requirements of a specific application. This part contains application-
specific functional blocks, controlled by special instructions. The integration of a classical proc-
essor and a configurable architecture within the same circuit allows to exploit the advantages of
both architectures. For the design of the configurable processor we used the VHDL language.
The implementation was performed using a Xilinx XCV600E FPGA device. This processor can
be used in several types of applications: data encryption and compression, image processing,
digital signal processing, special arithmetic.},
keywords = {Configurable computing, Configurable architectures, Reconfigurable devices, FPGA devices.},
review = {- The processor contains the kernel of a general-purpose processor and it can be extended in order to be used for specific application. This extension is accomplished by adding new instructions and functional units, without the change of the general-purpose kernel
- In traditional processors, operations are composed temporally by sequencing them in time, using registers or memory to store intermediate results.
- In configurable architectures, tasks are implemented by spatially composing primitive operators, that is, by linking them together by wires.
- FPGA devices can control operations at bit level, while processors can control their operators only at word level. As a result, processors often waste part of their computational capacity when operating on narrow-width data.},
timestamp = {2019-06-12},
}
@Electronic{gonzalez2005configurable,
author = {Ricardo E. Gonzalez},
editor = {Stretch Inc},
title = {A Software Configurable Processor},
date = {2005-09-29},
url = {https://pdfs.semanticscholar.org/89ad/640a6e704844a0098278e79a8ef06a934c62.pdf},
keywords = {assp, asic, fpga, gpu, dsp, cpu, configurable computing},
review = {- Highest compute performance: ASSP/ASIC > FPGA/GPU > DSP > CPU
- Lowest system time-to-market: CPU/ASSP > DSP > FPGA > ASIC
- Software solution: ASSP, CPU, DSP, GPU
- Hardware solution: FPGA, ASIC
- Compute intensive markets & applications:
- military & security (sonar/radar, biometrics)
- medical electronics (CAT scan, ultrasound)
- office automation (printers, scanners)
- networking (encryption/decryption, network security)
- video & audio (broadcast equipment, audio studio)
- EEMBC, the Embedded Microprocessor Benchmark Consortium, develops and certifies real-world benchmarks and benchmark scores to help designers select the right embedded processors for their systems
- ISEF: Instruction Set Extension Fabric},
timestamp = {2019-06-12},
}
@WWW{miller2013configurable,
author = {Warren Miller},
editor = {Electronic Engineering Times},
title = {Configurable processors as an alternative to FPGAs},
date = {2013-07-03},
url = {https://www.eetimes.com/author.asp?section_id=36&doc_id=1318804},
abstract = {An exploration of using configurable processors as an alternative to the traditional FPGA approach to creating a custom system.},
keywords = {configurable processor, fpga, isef},
review = {- Configurable processors can implement many compute oriented functions FPGAs can address but with some distinct advantages.
- Stretch is a microprocessor company that puts programmable fabric inside the processor.
- FPGA companies [puts] processors on FPGAs
- The ISEF is not made up of the familiar Look-Up-Tables (LUTs) seen in most FPGAs.
- [The ISEF] is a vast collection of ALUs, shift registers, and similar compute fabric elements.
- Ability for "pure" software designers to access configurable technology.
- Algorithms that are commonly implemented in the fabric can be moved into the dedicated programmable accelerator on the next generation device.
- FPGA implementations […] target much wider applications sets and must by their nature stay more generic.},
timestamp = {2019-06-12},
}
@WWW{ibm2018mixedprecision,
author = {{IBM Research Editorial Staff}},
editor = {IBM Research Blog},
title = {IBM Scientists Demonstrate Mixed-Precision In-Memory Computing for the First Time; Hybrid Design for AI Hardware},
date = {2018-04-17},
url = {https://www.ibm.com/blogs/research/2018/04/ibm-scientists-demonstrate-mixed-precision-in-memory-computing-for-the-first-time-hybrid-design-for-ai-hardware/},
keywords = {in-memory computing, hybrid, ibm},
review = {- The fundamental design of today’s computers, which are based on the von Neumann architecture, [requires] data to be shuttled back and forth at high speeds, an inefficient process.
- The clear answer is to transition to a non-von Neumann architecture in which memory and processing coexist in some form.
- in-memory computing [uses] nanoscale resistive memory devices, organized in a computational memory unit, for both processing and memory.
- called mixed-precision in-memory computing […] combines a von Neumann machine with a computational memory unit
- In this hybrid design, the computational memory unit performs the bulk of the computational tasks, whereas the von Neumann machine implements a method to iteratively improve or refine the accuracy of the solution},
timestamp = {2019-06-12},
}
@WWW{ibm2017inmemory,
author = {{IBM Research Editorial Staff}},
editor = {IBM Research Blog},
title = {IBM Scientists Demonstrate In-memory Computing with 1 Million Devices for Applications in AI},
date = {2017-10-24},
url = {https://www.ibm.com/blogs/research/2017/10/ibm-scientists-demonstrate-memory-computing-1-million-devices-applications-ai/},
keywords = {in-memory computing, computational memory, ibm, phase change memory},
review = {- “In-memory computing” or “computational memory” is an emerging concept that uses the physical properties of memory devices for both storing and processing information.
- Phase Change Memory},
timestamp = {2019-06-12},
}
@Electronic{sebastian2017temporal,
author = {Abu Sebastian and Tomas Tuma and Nikolaos Papandreou and Manuel Le Gallo and Lukas Kull and Thomas Parnell and Evangelos Eleftheriou},
editor = {Nature communications},
title = {Temporal correlation detection using computational phase-change memory},
date = {2017-10-24},
url = {https://www.nature.com/articles/s41467-017-01481-9},
abstract = {Conventional computers based on the von Neumann architecture perform computation by repeatedly transferring data between their physically separated processing and memory units. As computation becomes increasingly data centric and the scalability limits in terms of performance and power are being reached, alternative computing paradigms with collocated computation and storage are actively being sought. A fascinating such approach is that of computational memory where the physics of nanoscale memory devices are used to perform certain computational tasks within the memory unit in a non-von Neumann manner. We present an experimental demonstration using one million phase change memory devices organized to perform a high-level computational primitive by exploiting the crystallization dynamics. Its result is imprinted in the conductance states of the memory devices. The results of using such a computational memory for processing real-world data sets show that this co-existence of computation and storage at the nanometer scale could enable ultra-dense, low-power, and massively-parallel computing systems.},
keywords = {phase-change memory},
timestamp = {2019-06-12},
}
@WWW{thornton2018harvard,
author = {Scott Thornton},
editor = {Microcontroller Tips},
title = {What's the difference between Von-Neumann and Harvard architectures?},
date = {2018-03-08},
url = {https://www.microcontrollertips.com/difference-between-von-neumann-and-harvard-architectures/},
keywords = {architecture, von neumann, harvard, bottleneck},
review = {- In a Von-Neumann architecture, the same memory and bus are used to store both data and instructions
- The Harvard architecture stores machine instructions and data in separate memory units that are connected by different busses.
- Modern processors might share memory but have mechanisms like special instructions that keep data from being mistaken for code: “modified Harvard architecture.”. The memory controller is where the modification is seated, since it handles the memory and how it is used.
- A typical computation with a Von-Neumann architecture is:
- read code
- read memory
- compute
- write memory
- The Von Neumann bottleneck occurs when data taken in or out of memory must wait while the current memory operation is completed
- The Von Neumann bottleneck has increased over time because processors have improved in speed while memory has not progressed as fast.
- It is interesting to note that speculative execution is the conduit for one of the latest security flaws discovered by Google Project Zero, named Spectre.},
timestamp = {2019-06-12},
}
@WWW{arm2008harvard,
author = {{ARM Technical Support Knowledge Articles}},
editor = {ARM Limited},
title = {What is the difference between a von Neumann architecture and a Harvard architecture?},
date = {2008-09-09},
url = {http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka3839.html},
keywords = {von neumann, harvard, architecture},
review = {- Using a simple, unified memory system together with a Harvard architecture is highly inefficient.
- Unless it is possible to feed data into both busses at the same time, it might be better to use a von Neumann architecture processor.},
timestamp = {2019-06-12},
}
@WWW{vandrei2014vonneumann,
author = {VAndrei},
editor = {StackOverflow},
title = {Von Neumann vs Harvard architecture},
date = {2014-11-09},
url = {https://stackoverflow.com/questions/26826248/von-neumann-vs-harvard-architecture},
keywords = {von neumann, harvard, architecture},
review = {- Why isn't a pure Harvard architecture adopted for PC's? L1 Instruction Cache miss ratio is very small. This means that generally code size is not a problem. So it wouldn't make sense to design a fully separate path for code. Data can grow very large but code can't really.
- In DSP's it makes sense to use separate code and data paths. That's because DSP's work mainly on "streaming data" meaning that the need for caching is rather small. Also the DSP codes can contain pre-computed coefficients that increase the code size. So there is a balance between data size and code size, meaning that it makes sense using a Harvard architecture.},
timestamp = {2019-06-12},
}
@WWW{eijkhout2016alternatives,
author = {Victor Eijkhout},
editor = {Quora},
title = {Are there alternatives to the Von Neumann architecture?},
date = {2016-01-10},
url = {https://www.quora.com/Are-there-alternatives-to-the-Von-Neumann-architecture},
keywords = {alternative, von neumann, dataflow},
review = {- The Von Neumann architecture is focused on instructions
- It makes the assumption that data can be found and created faster than the instruction execution rate. This used to be true at some point, when data could be transferred in one cycle and processors took several cycles for an instruction.
- This has not been the case for 2 decades or so. As a result processors are now reasoning about instructions.
- Dataflow has plenty of interesting ideas, but the notion that a compiler could do dataflow analysis on a regular programming language has proved to be a complete chimera.
- explicit dataflow programming is very hard
- Dataflow is used inside your CPU, completely realized in hardware; it is the basis for out-of-order execution, and is used as a mechanism to hide latency},
timestamp = {2019-06-12},
}
@Electronic{fouquetlapar2008vonneumann,
author = {Matthias Fouquet-Lapar},
editor = {SGI},
title = {The von Neumann Architecture and Alternatives},
date = {2008-05-16},
url = {https://wwz.ifremer.fr/pcdm/content/download/29481/407627/file/mfl.pdf},
keywords = {von neumann, bottleneck, xpp, hpc, fpga, gpu, ibm cell},
review = {- Physical semi-conductor limits
- clock frequency
- heat
- etching below 10 nm
- Von Neumann bottleneck: cache memory is used to hide memory latencies, hyperthreading allows additional hiding of memory references by switching to a different thread
- Caches occupy more and more places on the die
- Additional transistors can be used to add cores on the same die
- The ONLY problem is your application/workflow because it has to be parallelized
- Maybe you also need higher performance, better silicon utilization, less power consumption, better price/performance
- The way and degree of parallelization is application specific (one solution cannot fit them all, solution is hybrid)
- 1998, Tensor Processing Unit (TPU): hybrid computational model, adaptive in execution, library-based programming model
- TPU is deeply pipelined and heavily chained vector operations
- TPU, some datapaths are reconfigurable
- TPU for radar, medical imaging (CT), image processing (compression, decompression, filtering)
- technologies for hybrid acceleration: FPGA, GPU, ClearSpeed, PACT XPP, multi-core CPUs, IBM Cell
- PACT XPP, from von-Neumann instruction to configuration/data flow
- Not every Accelerator Technology is applicable to every HPC Problem – and it’s a moving target
- Not every HPC Application can be accelerated
- a solution has to be price/performance competitive.
- There should be at least a 10-20X speedup compared to current top-end CPUs},
timestamp = {2019-06-12},
}
@WWW{stackoverflow2012nonvonneumann,
author = {{Various persons}},
editor = {StackOverflow},
title = {What are some examples of non-Von Neumann architectures?},
date = {2012-11-12},
url = {https://stackoverflow.com/questions/1806490/what-are-some-examples-of-non-von-neumann-architectures},
keywords = {von neumann, architecture, harvard, dataflow, reduction, cellular automata, quantum computing},
review = {- Harvard architecture
- Modified Harvard architecture
- Dataflow machines
- Reduction machines
- Cellular automata
- Quantum computing / Quantum Turing machine
- Problem dependent machines},
timestamp = {2019-06-12},
}
@Electronic{schuler2001xpp,
author = {E. Schüler and Tim Helfers},
editor = {PACT, Astrium},
title = {XPP - eXtreme Processing Platform Technology for space applications},
date = {2001-09-20},
url = {http://spacewire.esa.int/WG/Data-Systems/OPDP-proceedings/Presentations/session%20II.B/3_Astrium_XPP_Helfers.pdf},
keywords = {xpp, architecture, von neumann, fpga},
review = {- XPP reconfigurable processing cores
- conventional processors use the sequential model, each operation takes one clock cycle, multiple operations are computed consecutively
- Multiple computations are mapped as code sections onto a two dimensional array (flow graph mapping)
- The code sections are mapped directly onto the processing array
- Von Neumann architecture = instructions
- XPP architecture = configuration
- Other technologies: VLIW architectures, Multi DSP cores, Reconfigurable processors, FPGA
- comparison between multi-DSP cores, processor applications in FPGAs and reconfigurable coprocessors},
timestamp = {2019-06-12},
}
@Electronic{compton2002reconfigurable,
author = {Katherine Compton and Scott Hauck},
editor = {ACM Computing Surveys},
title = {Reconfigurable computing: a survey of systems and software},
date = {2002-05-14},
url = {https://people.ece.uw.edu/hauck/publications/ConfigCompute.pdf},
abstract = {Due to its potential to greatly accelerate a wide variety of applications, reconfigurable computing has become a subject of a great deal of research. Its key feature is the ability to perform computations in hardware to increase performance, while retaining much of the flexibility of a software solution. In this survey, we explore the hardware aspects of reconfigurable computing machines, from single chip architectures to multi-chip systems, including internal structures and external coupling. We also focus on the software that targets these machines, such as compilation tools that map high-level algorithms directly to the reconfigurable substrate. Finally, we consider the issues involved in run-time reconfigurable systems, which reuse the configurable hardware during program execution},
keywords = {reconfigurable computing, fpga, architecture},
review = {- Reconfigurable computing as a concept has been in existence for quite some time [Estrin et al. 1963]
- The recent advances in reconfigurable computing are for the most part derived from the technologies developed for FPGAs in the mid-1980s
- FPGAs were originally created to serve as a hybrid device between PALs and Mask-Programmable Gate Arrays (MPGAs).
- The flexibility, capacity,and performance of [FPGAs] has opened up completely new avenues in high-performance computation, forming the basis of reconfigurable computing.
- when the percentage of logic blocks used in an FPGA becomes very high, automatic routing tools frequently have difficulty achieving the necessary connections between the blocks
- Programmable logic tends to be inefficient at implementing certain types of operations, such as variable-length loops and branch control.
- a reconfigurable unit may be used
* to provide reconfigurable functional units within a host processor,
* as a coprocessor
* as an attached reconfigurable processing unit
* as an external stand-alone processing unit
- Since the introduction of FPGAs in the mid-1980s, there have been many different investigations into what computation element(s) should be built into the array
- These structures, commonly called logic blocks or cells, vary in complexity
- Because multiplication is one of the more difficult computations to implement efficiently in a traditional FPGA structure, the custom multiplication hardware embedded within a reconfigurable array allows a system to perform even that function well
- Altera has demonstrated a preliminary ARM9-based Excalibur device, which combines reconfigurable hardware with an embedded ARM9 processor core [Altera 2001]
- for FPGAs, high-LUT utilization may not necessarily be the most desirable situation, but rather efficient routing usage may be of more importance. This is because the routing resources occupy a much larger part of the area of an FPGA than the logic resources, and therefore the most area-efficient designs will be those that optimize their use of the routing resources rather than the logic resources.},
timestamp = {2019-06-12},
}
@WWW{sato2017tpu,
author = {Kaz Sato and Cliff Young and David Patterson},
editor = {Google},
title = {An in-depth look at Google’s first Tensor Processing Unit (TPU)},
date = {2017-05-12},
url = {https://cloud.google.com/blog/products/gcp/an-in-depth-look-at-googles-first-tensor-processing-unit-tpu},
abstract = {There’s a common thread that connects Google services such as Google Search, Street View, Google Photos and Google Translate: they all use Google’s Tensor Processing Unit, or TPU, to accelerate their neural network computations behind the scenes.},
keywords = {tpu, tensor processing unit, google, architecture, reconfigurable computing},
review = {- In short, we found that the TPU delivered 15–30X higher performance and 30–80X higher performance-per-watt than contemporary CPUs and GPUs
- Google considered building an Application-Specific Integrated Circuit (ASIC) for neural networks as early as 2006
- We chose to package the processor as an external accelerator card that fits into an SATA hard disk slot for drop-in installation.
- The TPU is connected to its host via a PCIe Gen3 x16 bus that provides 12.5GB/s of effective bandwidth.
- Quantization is a powerful tool for reducing the cost of neural network predictions
- Being able to use integer rather than floating point operations greatly reduces the hardware footprint and energy consumption of our TPU
- A TPU contains 65,536 8-bit integer multipliers
- [The TPU is] designed to be flexible enough to accelerate the computations needed to run many different kinds of neural network models.
- We chose the Complex Instruction Set Computer (CISC) style as the basis of the TPU instruction set instead
- The heart of the TPU: A systolic array
- CPUs and GPUs often spend energy to access multiple registers per operation. A systolic array chains multiple ALUs together, reusing the result of reading a single register.
- It makes an engineering tradeoff: limiting registers, control and operational flexibility in exchange for efficiency and much higher operation density.
- a TPU can process 65,536 multiply-and-adds for 8-bit integers every cycle. Because a TPU runs at 700MHz, a TPU can compute 65,536 × 700,000,000 = 46 × 1012 multiply-and-add operations or 92 Teraops per second (92 × 1012) in the matrix unit
- A TPU has none of the sophisticated microarchitectural features that consume transistors and energy to improve the average case but not the 99th-percentile case: no caches, branch prediction, out-of-order execution, multiprocessing, speculative prefetching, address coalescing, multithreading, context switching and so forth.
- Minimalism is a virtue of domain-specific processors.},
timestamp = {2019-06-12},
}
@WWW{dean2017cloudtpu,
author = {Jeff Dean and Urs Hölzle},
editor = {Google},
title = {Build and train machine learning models on our new Google Cloud TPUs},
date = {2017-05-17},
url = {https://blog.google/products/google-cloud/google-cloud-offer-tpus-machine-learning/},
keywords = {tpu, tensor processing unit, google, reconfigurable computing, ai},
review = {- Each of these new TPU devices delivers up to 180 teraflops of floating-point performance
- A TPU pod contains 64 second-generation TPUs and provides up to 11.5 petaflops to accelerate the training of a single large machine learning model
- Shazam recently announced that they successfully migrated major portions of their music recognition workloads to NVIDIA GPUs on Google Cloud and saved money while gaining flexibility.},
timestamp = {2019-06-12},
}
@Electronic{veen1986dataflow,
author = {Arthur H. Veen},
editor = {ACM Computing Surveys},
title = {Dataflow machine architecture},
date = {1986-12-31},
url = {https://www.researchgate.net/publication/220566271},
abstract = {Dataflow machines are programmable computers of which the hardware is optimized for fine-grain data-driven parallel computation. The principles and complications of data-driven execution are explained, as well as the advantages and costs of fine-grain parallelism. A general model for a dataflow machine is presented and the major design options are discussed.Most dataflow machines described in the literature are surveyed on the basis of this model and its associated technology. For general-purpose computing the most promising dataflow machines are those that employ packet-switching communication and support general recursion. Such a recursion mechanism requires an extremely fast mechanism to map a sparsely occupied virtual space to a physical space of realistic size. No solution has yet proved fully satisfactory.A working prototype of one processing element is described in detail. On the basis of experience with this prototype, some of the objections raised against the dataflow approach are discussed. It appears that the overhead due to fine-grain parallelism can be made acceptable by sophisticated compiling and employing special hardware for the storage of data structures. Many computing-intensive programs show sufficient parallelism. In fact, a major problem is to restrain parallelism when machine resources tend to get overloaded. Another issue that requires further investigation is the distribution of computation and data structures over the processing elements.},
keywords = {dataflow, data-driven computing, architecture},
review = {- data-driven parallel computing
- the efficiency of a parallel computer is influenced by several conflicting factors: contention (for a shared resource, usually shared memory or some other communication channel), scalability (property that the performance of the machine can always be improved by adding more processing elements)
- In dataflow machines scheduling is based on availability of data; this is called data-driven execution.},
timestamp = {2019-06-12},
}
@Electronic{jones2010gpufpga,
author = {David H. Jones and Adam Powell and Christos-Savvas Bouganis and Peter Y. K. Cheung},
editor = {Imperial College London},
title = {GPU versus FPGA for high productivity computing},
date = {2010-08-03},
url = {http://cas.ee.ic.ac.uk/people/ccb98/papers/DavidFPL10.pdf},
abstract = {Heterogeneous or co-processor architectures are becoming an important component of high productivity computing systems (HPCS). In this work the performance of a GPU based HPCS is compared with the performance of a commercially available FPGA based HPC. Contrary to previous approaches that focussed on specific examples, a broader analysis is performed by considering processes at an architectural level. A set of benchmarks is employed that use different process architectures in order to exploit the benefits of each technology. These include the asynchronous pipelines common to “map” tasks, a partially synchronous tree common to “reduce” tasks and a fully synchronous, fully connected mesh. We show that the GPU is more productive than the FPGA architecture for most of the benchmarks and conclude that FPGA-based HPCS is being marginalised by GPUs.},
keywords = {hpcs, high productivity computing system, architecture, gpu, fpga, comparison},
timestamp = {2019-06-12},
}
@Electronic{kalarot2010stereo,
author = {Kalarot, Ratheesh and Morris, John},
editor = {{The University of Auckland / IEEE}},
title = {Comparison of {FPGA} and {GPU} Implementations of Real-Time Stereo Vision},
date = {2010-05-19},
url = {https://www.researchgate.net/profile/John_Morris25/publication/224165460_Comparison_of_FPGA_and_GPU_implementations_of_real-time_stereo_vision/links/0f317539b0c42b50be000000.pdf},
abstract = {Real-time stereo vision systems have many applications - from autonomous navigation for vehicles through surveillance to materials handling. Accurate scene interpretation depends on an ability to process high resolution images in real-time, but, although the calculations for stereo matching are basically simple, a practical system needs to evaluate at least {$10^9$} disparities every second - beyond the capability of a single processor. Stereo correspondence algorithms have high degrees of inherent parallelism and are thus good candidates for parallel implementations. In this paper, we compare the performance obtainable with an FPGA and a GPU to understand the trade-off between the flexibility but relatively low speed of an FPGA and the high speed and fixed architecture of the GPU. Our comparison highlights the relative strengths and limitations of the two systems. Our experiments show that, for a range of image sizes, the GPU manages {$2 \times 10^9$} disparities per second, compared with {$2.6 \times 10^9$} disparities per second for an FPGA.},
keywords = {gpu, fpga, comparison},
review = {- For this application at least, the FPGA implementation is superior, despite a much slower internal clock.},
timestamp = {2019-06-12},
}
@Electronic{chase2009opticalflow,
author = {Chase, Jeff and Nelson, Brent and Bodily, John and Wei, Zhaoyi and Lee, Dah-Jye},
editor = {{Brigham Young University}},
title = {Real-Time Optical Flow Calculations on {FPGA} and {GPU} Architectures: A Comparison Study},
date = {2009-04-30},
url = {https://www.researchgate.net/profile/Lee_Dah-Jye/publication/224362818_Real-Time_Optical_Flow_Calculations_on_FPGA_and_GPU_Architectures_A_Comparison_Study/links/0c9605327135c229e0000000.pdf},
abstract = {FPGA devices have often found use as higher-performance alternatives to programmable processors for implementing a variety of computations. Applications successfully implemented on FPGAs have typically contained high levels of parallelism and have often used simple statically-scheduled control and modest arithmetic. Recently introduced computing devices such as coarse grain reconfigurable arrays, multi-core processors, and graphical processing units (GPUs) promise to significantly change the computational landscape for the implementation of high-speed real-time computing tasks. One reason for this is that these architectures take advantage of many of the same application characteristics that fit well on FPGAs. One real-time computing task, optical flow, is difficult to apply in robotic vision applications in practice because of its high computational and data rate requirements, and so is a good candidate for implementation on FPGAs and other custom computing architectures. In this paper, a tensor-based optical flow algorithm is implemented on both an FPGA and a GPU and the two implementations discussed. The two implementations had similar performance, but with the FPGA implementation requiring 12× more development time. Other comparison data for these two technologies is then given for three additional applications taken from a MIMO digital communication system design, providing additional examples of the relative capabilities of these two technologies.},
keywords = {fpga, gpu, comparison},
review = {- The results were mixed with similar performance provided by both FPGA and GPU platforms for the optical flow and trellis computations and better performance provided by the FPGA for the pilot detector and timing and channel estimator computations.
- FPGAs possess unrivaled flexibility for combining custom I/O with computation.
- GPUs seem to be more sensitive to compute-to-I/O ratio than FPGAs.},
timestamp = {2019-06-12},
}
@WWW{kidd2009power1,
author = {Kidd, Taylor IoT},
editor = {{Intel Developer Zone}},
title = {Why {P} Scales as {$C*V^2*f$} is so Obvious},
date = {2009-06-29},
url = {https://software.intel.com/en-us/blogs/2009/06/29/why-p-scales-as-cv2f-is-so-obvious},
keywords = {power, energy, consumption, cmos, physics},
review = {- Explanation of the formula},
timestamp = {2019-06-14},
}
@WWW{kidd2009power2,
author = {Kidd, Taylor IoT},
editor = {{Intel Developer Zone}},
title = {Why {P} Scales as {$C*V^2*f$} is so Obvious (pt 2)},
date = {2015-01-01},
url = {https://software.intel.com/en-us/blogs/2009/08/25/why-p-scales-as-cv2f-is-so-obvious-pt-2-2},
keywords = {power, energy, consumption, cmos, physics},
internal-note = {NOTE(review): date field says 2015-01-01 but the URL path suggests publication on 2009-08-25 -- verify against the page},
timestamp = {2019-06-14},
}
@WWW{vipress2019semis,
editor = {{ViPress.net}},
title = {Infineon et {NXP} devant {STMicroelectronics} au 1er trimestre 2019},
date = {2019-05-17},
url = {https://www.vipress.net/infineon-et-nxp-devant-stmicroelectronics-au-1er-trimestre-2019/},
abstract = {Après avoir déjà dépassé Samsung au 4e trimestre 2018, Intel renforce ainsi son avance sur le Coréen de près de 3 milliards de dollars au 1er trimestre 2019, selon IC Insights. Intel a détrôné Samsung en tant que premier fournisseur de semiconducteurs au 4e trimestre 2018 après avoir perdu sa place au profit de Samsung au 2e trimestre 2019. Alors que Samsung occupait le premier rang du classement pour l’ensemble de 2017 et de 2018, Intel devrait reprendre facilement le premier rang pour l’année entière de 2019, poste qu’il occupait auparavant de 1993 à 2016. Le retournement des marchés des mémoires Drams et flash NAND au cours de l’année écoulée explique ce basculement. Au premier trimestre 2018, les ventes totales de semiconducteurs de Samsung étaient supérieures de 23% à celles d’Intel ; au 1er trimestre 2019, c’est l’inverse : celles d’Intel dépassent celles du Coréen de 23% !},
keywords = {semi-conducteur, classement, intel, samsung, ventes},
timestamp = {2019-06-14},
}
@Electronic{conrad2009kit,
editor = {{Conrad}},
title = {Kit d’apprentissage de l’électronique pour débutants},
date = {2009-07-31},
url = {https://produktinfo.conrad.com/datenblaetter/175000-199999/192230-an-01-fr-LERNPAKET_25_ELEKTRONIK_EXPERIMENTE.pdf},
keywords = {électronique, composants, conrad, circuit},
timestamp = {2019-06-15},
}
@Electronic{gomezprado2006routing,
author = {Gómez Prado, Daniel Francisco},
editor = {{University of Massachusetts}},
title = {Tutorial on {FPGA} Routing},
date = {2006-08-31},
url = {http://sisbib.unmsm.edu.pe/bibvirtualdata/publicaciones/electronica/n17_2006/a04.pdf},
abstract = {The entire CAD process that is necessary to implement a circuit in an FPGA (from the RTL description of the design) consists of the following steps: •Logic optimization. Performs two-level or multi-level minimization of the Boolean equations to optimize area, delay, or a combination of both. •Technology mapping. Transforms the Boolean equations into a circuit of FPGA logic blocks. This step also optimizes the total number of logic blocks required (area optimization) or the number of logic blocks in time-critical paths (delay optimization). •Placement. Selects the specific location for each logic block in the FPGA, while trying to minimize the total length of interconnect required. •Routing. Connects the available FPGA’s routing resources1 with the logic blocks distributed inside the FPGA by the placement tool, carrying signals from where they are generated to where they are used.},
keywords = {routing, fpga, model},
review = {- The C [Connection] boxes connect the channel wires with the input and output pins of the CLBs
- The S [Switch] boxes allow wires to switch between vertical and horizontal wires
- Switch boxes that allow connection to any other domain are called Wilton switch boxes, and they are broadly used as they provide greater flexibility on routing
- [Single-length lines] are intended for relatively short connections among CLBs and they span through one CLB only
- [Double-length lines] are similar to the Single-length lines, except that each one spans two CLBs, offering lower routing delays for moderately long connection
- [Long lines] are appropriate for connections that require reaching several CLBs with low-skew.
- Increasing the flexibility of the switch box, the connection box and the number of wires per channel makes routing a trivial problem [17] as all possible interconnections are available. But increasing routing resources has the drawback that waste area and transistors in the FPGA, as only a fraction of those resources will be used for a given design, even worse it increases the number of interconnect transistors which are the principal reason of delay on FPGAs},
timestamp = {2019-06-15},
}
@WWW{vasut2016opensource,
author = {Vašut, Marek},
editor = {{DENX Software Engineering, The Linux Foundation}},
title = {Open-Source Tools for {FPGA} Development},
date = {2016-10-11},
url = {https://www.youtube.com/watch?v=MI18Wk4gxA4},
abstract = {Programmable hardware is becoming increasingly popular in the recent years, yet the software tools for working with such programmable hardware are dominated by closed-source proprietary solutions. This is now changing. In this presentation, Marek will summarize the open-source tools for working with programmable hardware, like "icestorm", "vtr", "ghdl" and "iverilog". Marek will show how to use the open-source tools to produce a working design and explain the benefits and limitations of such solutions. At the end of the talk, Marek will outline the process of implementing such tools to demonstrate why this is so much effort.},
keywords = {open source, fpga, tools, development},
timestamp = {2019-06-19},
}
@Electronic{arenas2018quartus,
author = {Arenas, Aaron},
editor = {{Intel}},
title = {Introduction to {FPGA} Design in {Quartus}},
date = {2018-08-31},
url = {https://fpgawiki.intel.com/uploads/0/07/Intro_to_FPGA_Workshop_Slides.pdf},
abstract = {FPGAs at Intel
Fundamentals of Digital Electronics
FPGA Architecture
Intel® Quartus® Prime Design Software
FPGA Design Flow},
keywords = {fpga, quartus, intel},
timestamp = {2019-06-20},
}
@Electronic{,
editor = {Altera, Intel},
title = {Cyclone V Device Datasheet},
date = {2019-01-25},