@Comment{Removed GitHub web-scrape residue (page chrome and gutter line numbers) that preceded the BibTeX content.}
% Encoding: UTF-8
@WWW{verry2012apple,
  author    = {Verry, Tim},
editor = {PC Perspective},
title = {Apple’s A6 processor uses hand drawn ARM cores to boost performance},
date = {2012-09-27},
url = {https://pcper.com/2012/09/apples-a6-processor-uses-hand-drawn-arm-cores-to-boost-performance/},
keywords = {ARM, A15, Apple, A6, iPhone 5,hand drawn},
review = {- Texas Instruments is giving up on ARM chips for tablets and smartphones
- hand drawn designs are becoming increasingly rare
- AMD has given up hand drawn design with Steamroller
- Apple and the engineers acquired from its purchase of PA Semi have manually drawn out the processor by hand},
timestamp = {2019-06-08},
}
@WWW{wikipedia2013pasemi,
  editor    = {{Wikipedia}},
title = {P.A. Semi},
year = {2013},
url = {https://en.wikipedia.org/wiki/P.A._Semi},
keywords = {fabless, semiconductor, acquisition, Apple},
review = {- On 23 April 2008, Apple announced that they had acquired P.A. Semi.
- On 11 June 2008, during the annual Worldwide Developer’s Conference, Apple CEO Steve Jobs said that the acquisition was meant to add the talent of P.A. Semi’s engineers to Apple’s workforce and help them build custom chips for the iPod, iPhone and other future mobile devices such as the iPad.},
timestamp = {2019-06-08},
}
@Electronic{weber2006maskcost,
  author    = {Weber, Charles and Berglund, C. Neil and Gabella, Patricia},
  editor    = {{Portland State University}},
title = {Mask cost and profitability in photomask manufacturing, an empirical analysis},
date = {2006-11},
url = {http://web.pdx.edu/~webercm/documents/2006%20November%20IEEE%20TSM%20Weber%20Berglund%20Gabella.pdf},
abstract = {An empirical study of the economics of manufacturing photomasks concludes that the
uncontrolled growth of optical proximity effect correction and resolution enhancement techniques is
driving up the cost of pattern generation and mask inspection to levels that threaten the profitability of
photomask manufacturing. The intrinsic cost of some leading edge photomasks has already exceeded
the price that customers are willing to pay for them. A model of the lifecycle of photomask manufacturing,
developed from interviews involving the 1990 to 2005 operations of six mask shops and a survey of
seven photomask manufacturers, shows that design for manufacturability (DFM) constitutes the most
promising approach for alleviating this market impasse. Unilateral action by mask shops to increase their
capital productivity is necessary but insufficient and perhaps unaffordable. DFM solutions will require the
majority of participants in the lithography value chain to collaborate according to a volatile demand
schedule that is driven by semiconductor manufacturers.},
keywords = {masks, costs, profitability, photomask, manufacturing},
review = {- In 2006, the cost of a mask set is ~US$ 500k for the 130 nm technology
- 65 nm node is expected to be ~US$ 2 million
- 90 nm node is expected to be ~US$ 1 million},
timestamp = {2019-06-08},
}
@Electronic{kahng1999subwavelength,
  author    = {Kahng, A. B. and Pati, Y. C.},
  editor    = {{UCLA Department of Computer Science} and {Numerical Technologies, Inc}},
title = {Subwavelength optical lithography, challenges and impact on physical design},
date = {1999-04-15},
url = {https://vlsicad.ucsd.edu/Publications/Conferences/94/c94.pdf},
abstract = {We review the implications of subwavelength optical lithography for new tools and ows in the interface between layout design and manufacturability. After discussing the necessity of corrections for optical process effects (i.e., use of optical proximity correction (OPC) and phase-shifting masks (PSM)), we focus on the implications of OPC and PSM for layout and verication methodologies. Our discussion addresses the necessary changes in the design-to- manufacturing ow, including infrastructure development in the mask and process communities as well as opportunities for research and development in physical layout and verification.},
timestamp = {2019-06-08},
}
@Electronic{unknown2007schematic6502,
  author    = {{Unknown}},
  editor    = {{Unknown}},
title = {6502 Schematic},
date = {2007-11-14},
url = {https://downloads.reactivemicro.com/Electronics/CPU/6502%20Schematic.pdf},
keywords = {6502, schematic},
review = {- it shows how instruction are decoded},
timestamp = {2019-06-09},
}
@WWW{cox2011mos6502,
  author    = {Cox, Russ},
title = {The MOS 6502 and the Best Layout Guy in the World},
date = {2011-01-03},
url = {https://research.swtch.com/6502},
abstract = {What are the key designs of the 6502 compared to other processors of its time},
}
@Electronic{chm2007motorola68000,
  editor    = {{Computer History Museum}},
title = {Oral history panel on the development and promotion of the Motorola 68000},
date = {2007-07-23},
url = {https://archive.computerhistory.org/resources/access/text/2012/04/102658164-05-01-acc.pdf},
keywords = {68000, motorola},
review = {- [For the 68000] All of that was done by hand. We didn't even need graphic programs to do it. At that time you drew it out in detail, on the mylar, and then you digitized it.
- we used CALMA for digitizing.
- Paper schematics, hand–drawing layout
- [how we really did logic simulation] Breadboards
- That was the early 1990s, and our 68060 was the first device that went into three of the gigantic Quick Time FPGA boxes, strapped together with big cables.},
timestamp = {2019-06-09},
}
@WWW{mcmillan2015pcbdesign,
  author    = {McMillan, John},
  editor    = {{Mentor, a Siemens Business}},
title = {PCB design then and now},
date = {2015-07-14},
url = {https://blogs.mentor.com/jimmartens/blog/2015/07/14/pcb-design-then-and-now/},
keywords = {pcb, cad, calma, digitizing},
review = {- talks about Calma Systems, a CAD system in then 70-80's
- a separate computer room complete with a raised floor hiding all the AC ducts and cabling housed all the data storage systems that served and backed-up the workstations.
- there was also a huge HP plotter used to print out each layer of digitized routes that would eventually be scaled and photographed on to clear film in the lab located across the hall.
- the industry has come a long, long way from hand-taping designs and digitizing hand-drawn layers as done with the Calma systems that we literally usedto build rooms around.},
timestamp = {2019-06-09},
}
@WWW{weisberg2008calma,
  author    = {Weisberg, David E.},
title = {The Engineering Design Revolution},
year = {2008},
url = {http://www.cadhistory.net/11%20CALMA.pdf},
keywords = {cad, engineering, calma},
timestamp = {2019-06-09},
}
@WWW{waalsdorp2019computers,
  editor    = {{Museum Waalsdorp}},
title = {Computers for electronic and mechanical engineering},
year = {2019},
url = {https://www.museumwaalsdorp.nl/en/history/comphistory/computer-history-the-period-1986-1989/comp866e/},
keywords = {calma, cad, engineering, pcb},
timestamp = {2019-06-09},
}
@WWW{ucamco2016calibr8tor,
  editor    = {{Ucamco, former Barco ETS}},
title = {Cilbr8tor Series},
year = {2016},
url = {https://www.ucamco.com/en/hardware/photoplotters/calibr8tor/calibr8tor-series},
keywords = {photoplotter},
review = {- minimum line width: 5 µm
- max precision : 50800 ppi},
timestamp = {2019-06-09},
}
@Book{waite1979graphics,
  author    = {Waite, Mitchell},
title = {Computer Graphics Primer},
year = {1979},
  editor    = {{Howard W. Sams \& Co., Inc.}},
subtitle = {At the cutting edge},
url = {https://www.atariarchives.org/cgp/Ch02_Sec25.php},
abstract = {Perhaps no single technology has had more impact on people than television. Yet according to the experts the real impact is just starting.
The reason? Home computers that connect to a standard television and convert it into a machine with more raw power than any product ever offered to the consumer and with the capability to completely alter the way we relate to the visual world of electronics.
This book is about one of the most exciting uses of the new home computer products—computer graphics—the ability to create complex drawings, plans, maps, and schematics on the screen of an ordinary black-and-white or color television. It is divided into three chapters. Chapter 1, “Perspectives,” presents what the entirely new field of home computer graphics is all about, explains how it got started, and illustrates some of the exciting applications for low-cost graphics displays. Chapter 2, “Basic Concepts,” introduces the general hardware and software concepts behind computer graphics and continues by presenting a profile of the numerous products on the market today. A section on graphics accessories is also included.
Chapter 3, the meat of the book, is entitled “Graphics Programming.” It introduces the graphics features of the Apple II computer used for this book, and then goes on to describe these concepts: plotting simple equations; drawing lines and vectors; creation of simple geometric shapes (rectangles, triangles, polygons, circles) as well as gaming figures (small tanks, jets, cars, rackets, animals); mandalas and other computer art effects, including tunneling; shape shifting, random artwork; detailed drawings and the use of digitizing tables; and, finally, moving figure animation.
The first two chapters of the book can be read any time and will be of help in evaluating which personal computer to buy for graphics work. The third chapter can be studied whether or not you own a computer, but your understanding will certainly be enhanced if one is available to practice the examples on.
The author hopes that you find this journey into computer graphics exciting, comprehensive, and, most of all, enjoyable.},
keywords = {calma, engineering},
review = {- The price tag on the GDS-II is a healthy $250,000.
- So far Calma has sold over 20 GDS-II systems and has many more on order.},
timestamp = {2019-06-09},
}
@Article{sugarman1975microprocessor,
  author    = {Sugarman, Robert},
  title     = {Does the country need a good \$20 microprocessor?},
journal = {The Engineering Newspaper for the Electronics Industry},
date = {1975-08-25},
url = {https://www.commodore.ca/gallery/magazines/misc/mos_605x_team_eetimes_august_1975.pdf},
abstract = {MOS Technology MCS650X microprocessor designers gather around a 200X print of the CPU Rubylith, color-coded for debugging into metallization, polysilicon and diffusion layers.
In the background is a 1000X expansion of the internal 21X143 decode-ROM, which manager Chuck Peddle claims is a key factor in obtaining small chip size.},
keywords = {6502, rubylith, engineering, chuck peddle},
review = {- 6502 was hand-drawn
- the mask used hand-cut Rubylith.},
timestamp = {2019-06-09},
}
@WWW{franz2015fpgahistory,
  author    = {Franz, Kaitlyn},
  editor    = {{Digilent Inc}},
title = {History of the FPGA},
date = {2015-01-16},
url = {https://blog.digilentinc.com/history-of-the-fpga/},
keywords = {fpga, timeline, technology},
review = {- 1960 first MOSFET
- 1961 first communication IC
- 1962 first TTL
- 1963 first CMOS
- 1965 Moore’s law
- 1970 PROM
- 1971 EPROM
- 1972 DST
- 1975 PLA (Programmable Logic Array)
- 1978 PAL (Programmable Array Logic)
- 1983 EEPROM
- 1983 GAL (Generic Array Logic)
- 1984 FLASH (~EEPROM)
- 1985 first FPGA},
timestamp = {2019-06-10},
}
@WWW{hardwarebee2018fpga,
  editor    = {{Hardwarebee}},
title = {Field Programmable Gate Array (FPGA) History and Applications},
date = {2018-02-23},
url = {http://hardwarebee.com/field-programmable-gate-array-fpga-history-applications/},
keywords = {altera, fpga, history},
review = {- The first reprogrammable logic device was created in 1984 by a company called Altera.
- It was the EP300 and offered a window that let an ultra-violet light onto EPROM cells, so they could be erased},
timestamp = {2019-06-10},
}
@WWW{nenni2012fpgahistory,
  author    = {Nenni, Daniel},
title = {A Brief History of FPGAs},
date = {2012-08-26},
url = {https://semiwiki.com/fpga/1596-a-brief-history-of-fpgas/},
keywords = {xilinx, fpga, history},
review = {- In the 80's semiconductors cost millions of dollars to design and manufacture
- FPGAs also dramatically reduced time to market for electronic products.
- Ross Freeman worked at Zilog before
- Ross Freeman created Xilinx in 1984
- Seiko started manufacturing the first FPGAs for Xilinx in 1985 using a very mature 1.2 micron process.
- the first Xilinx FPGA was a 1000 ASIC gate equivalent running at 18MHZ.},
timestamp = {2019-06-10},
}
@Electronic{chm2009ep300,
  editor    = {{Computer History Museum}},
title = {Altera EP300 Design \& Development Oral History Panel},
date = {2009-10-20},
url = {https://archive.computerhistory.org/resources/access/text/2012/10/102702147-05-01-acc.pdf},
keywords = {altera, ep300, interview, Source III},
timestamp = {2019-06-10},
}
@Electronic{gould2013peel18cv8,
  editor    = {{Gould Electronics}},
title = {Electrically Erasable Programmable Logic PEEL 18CV8},
date = {2013-07-14},
url = {https://www.datasheetarchive.com/pdf/download.php?id=3ae6b7f4f1c26b281f249beac3c15d411ba916&type=O},
}
@WWW{culver2011cpumaking,
  author    = {Culver, John},
  editor    = {{The CPU Shack}},
title = {How a CPU Microprocessor is made},
date = {2011-04-20},
url = {http://www.cpushack.com/MakingWafers.html},
keywords = {wafer, silicon, fabrication},
review = {- wafer is mainly made of silicon with electrically active elements such as arsenic, boron, phosphorous or antimony.},
timestamp = {2019-06-11},
}
@WWW{mcgrath2009fpgastartups,
  author    = {McGrath, Dylan},
  editor    = {{Electronic Engineering Times}},
title = {FPGA startups stare down giants and ghosts},
date = {2009-07-27},
url = {https://www.eetimes.com/document.asp?doc_id=1263547},
keywords = {timeline, fpga, vendors, xilinx, altera},
review = {- Timeline of programmable logic vendors
- Because programmable logic vendors have traditionally provided software design tools to users at very low or no cost, the price of entry in this market includes not just silicon R\&D, but software R\&D as well.
- Market watcher Gartner Inc. estimates that Xilinx Inc. and Altera Corp. together accounted for nearly 87 percent of the programmable logic market in 2008
- Analysts like Lewis and Rich Wawrzyniak of Semico Research Corp. attribute the rise in programmable logic startup activity at least partially to the expiration of several key patents once held by the established players.
- Patents considered critical to the birth of FPGAs began expiring a few years ago. They include the original FPGA patent (U.S. Patent No. 4,870,302), issued to Xilinx co-founder Ross Freeman in 1988.
- Xilinx has more software engineers than hardware engineers; at Altera, the mix is roughly 50-50.},
timestamp = {2019-06-11},
}
@Patent{freeman1989clbpatent,
  author    = {Freeman, Ross H.},
title = {Configurable electrical circuit having configurable logic elements and configurable interconnects},
number = {US4870302A},
date = {1989-09-26},
holder = {Xilinx Inc},
type = {patentus},
url = {https://patents.google.com/patent/US4870302A/},
abstract = {A configurable logic array comprises a plurality of configurable logic elements variably interconnected in response to control signals to perform a selected logic function. Each configurable logic element in the array is in itself capable of performing any one of a plurality of logic functions depending upon the control information placed in the configurable logic element. Each configurable logic element can have its function varied even after it is installed in a system by changing the control information placed in that element. Structure is provided for storing control information and providing access to the stored control information to allow each configurable logic element to be properly configured prior to the initiation of operation of the system of which the array is a part. Novel interconnection structures are provided to facilitate the configuring of each logic element.},
keywords = {fpga, xilinx},
review = {- 1988-02-19, application filed by Xilinx Inc
- 2006-09-26, anticipated expiration},
timestamp = {2019-06-11},
}
@WWW{kallaher2016palcpldfpga,
  author    = {Kallaher, Brandon},
  editor    = {{Digilent Blog}},
title = {PAL vs. CPLD vs. FPGA},
date = {2016-08-10},
url = {https://blog.digilentinc.com/pal-vs-cpld-vs-fpga/},
keywords = {pal, cpld, fpga, comparison, use case},
review = {- PALs are made using two building blocks: a logic plane and output logic cells.
- PALs generally have around 20 I/O pins
- The main advantage of a CPLD (Complex Programmable Logic Device) over a PAL is the larger number of available gates and I/O pins.
- A typical use case for a CPLD is to configure an FPGA upon boot.
- Field Programmable Gate Arrays (FPGAs) are completely reconfigurable devices that have gate counts in the millions and hundreds of I/O pins.
- [FPGA] allow for highly complex designs, such as processors, to be created and tested.},
timestamp = {2019-06-11},
}
@WWW{carton2006portes,
  author    = {Carton, Olivier},
  editor    = {{Institut de Recherche en Informatique Fondamentale}},
title = {Transistors et portes logiques},
date = {2006-09-12},
url = {https://www.irif.fr/~carton/Enseignement/Architecture/Cours/Gates/},
keywords = {porte, gate, transistor, not, nand, nor, xor, cmos},
review = {- La porte not peut être réalisée en logique CMOS par un circuit constitué de 2 transistors
- [La porte nand peut être réalisée en logique CMOS par un circuit] constitué de 4 transistors dont 2 n-MOS et 2 p-MOS.
- [La porte nor peut être réalisée en logique CMOS par un circuit] constitué de 4 transistors dont 2 n-MOS et 2 p-MOS.
- La porte and peut être réalisée en logique CMOS par un circuit constitué de 6 transistors dont 3 n-MOS et 3 p-MOS.
- La porte xor peut être réalisée en logique CMOS par un circuit constitué de 14 transistors dont 7 n-MOS et 7 p-MOS.},
timestamp = {2019-06-11},
}
@Electronic{altera2019cyclonev,
  editor    = {{Altera, Intel}},
title = {Cyclone V Device Handbook: Volume 1: Device Interfaces and Integration, Logic Array Blocks and Adaptive Logic Modules in Cyclone V Devices},
date = {2019-05-15},
url = {https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/hb/cyclone-v/cv_5v2.pdf},
keywords = {altera, intel, cyclone v},
timestamp = {2019-06-11},
}
@WWW{intel2015alteraacquisition,
  editor    = {{Intel}},
title = {Intel completes acquisition of Altera},
date = {2015-12-28},
url = {https://newsroom.intel.com/news-releases/intel-completes-acquisition-of-altera/},
abstract = {Intel Corporation (“Intel”) today announced that it has completed the acquisition of Altera Corporation (“Altera”), a leading provider of field-programmable gate array (FPGA) technology. The acquisition complements Intel’s leading-edge product portfolio and enables new classes of products in the high-growth data center and Internet of Things (IoT) market segments.},
}
@WWW{higginbotham2015intelaltera,
  author    = {Higginbotham, Stacey},
  editor    = {{Fortune}},
title = {Why Intel will spend $16.7 billion on Altera},
date = {2015-08-27},
url = {http://fortune.com/2015/08/27/why-intel-altera/},
keywords = {acquisition, intel, altera},
review = {- Three months ago [may 2015] Intel said it would buy chip maker Altera in a deal valued at $16.7 billion
- Jason Waxman, the VP & GM of the cloud platforms group at Intel, […] said that by 2020 Intel believes a third of the data center market could be using the type of chips that Altera specializes in.},
timestamp = {2019-06-11},
}
@WWW{bitfusion2019elastic,
  editor    = {{BitFusion}},
title = {BitFusion, the elastic AI infrastructure for multi-cloud},
date = {2019-06-11},
url = {https://bitfusion.io/},
keywords = {gpu, fpga, asic, ai, performance},
review = {- Virtual remote attached GPUs, FPGAs and ASICs for any AI application
- 2x performance boost and 2-4x cost reduction for your AI training and inference deployment},
timestamp = {2019-06-11},
}
@WWW{higginbotham2014microsoftfpga,
  author    = {Higginbotham, Stacey},
  editor    = {{GigaOM}},
title = {Why Microsoft is building programmable chips that specialize in search},
date = {2014-06-16},
url = {https://gigaom.com/2014/06/16/why-microsoft-is-building-programmable-chips-that-specialize-in-search/},
keywords = {microsoft, fpga, cpu, comparison},
review = {- According to Doug Burger, the Microsoft Research employee quoted in the Wired story, the the FPGAs are 40 times faster than a generic Xeon CPU when it comes to running Microsoft’s algorithms.
- That explains why Microsoft, and other webscale giants from Amazon to Google are investigating different chip architectures for their servers.
- And Microsoft’s decision to test FPGAs is doubly interesting because they can actually be re-programmed when the company’s algorithms change, making them a costly, but flexible option. And if there’s one thing we know about the cloud, it’s that flexibility trumps cost.},
timestamp = {2019-06-11},
}
@WWW{harris2015microsoftfpga,
  author    = {Harris, Derrick},
  editor    = {{GigaOM}},
title = {Microsoft is building fast, low-power neural networks with FPGAs},
date = {2015-02-23},
url = {https://gigaom.com/2015/02/23/microsoft-is-building-fast-low-power-neural-networks-with-fpgas/},
keywords = {microsoft, ai, fpga, neural network},
timestamp = {2019-06-11},
}
@Electronic{ovtcharov2015cnn,
  author    = {Ovtcharov, Kalin and Ruwase, Olatunji and Kim, Joo-Young and Fowers, Jeremy and Strauss, Karin and Chung, Eric S.},
  editor    = {{Microsoft Research}},
title = {Accelerating deep convolutional neural networks using specialized hardware},
date = {2015-02-22},
url = {https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/CNN20Whitepaper.pdf},
abstract = {Recent breakthroughsin the development of multi-layer convolutional neural networkshave led to state-of-the-art improvements in the accuracy of non-trivial recognition tasks such as large-category image classificationand automaticspeech recognition.These many-layered neural networks are large, complex,and require substantial computingresourcesto train and evaluate. Unfortunately, these demands come at an inopportunemoment due to the recent slowing of gains in commodity processor performance.Hardware specializationin the form of GPGPUs, FPGAs, and ASICs offers a promising path towards major leaps in processing capabilitywhile achieving high energy efficiency. To harness specialization, an effort is underwayat Microsoft to accelerate Deep Convolutional Neural Networks (CNN) usingservers augmented with FPGAs—similar to the hardware that is being integrated intosome of Microsoft’s datacenters. Initial efforts to implement a single-node CNN accelerator on a mid-rangeFPGA showsignificant promise, resulting in respectableperformance relative to prior FPGA designs and high-end GPGPUs, at a fraction of the power. In the future, combining multiple FPGAs over a low-latency communication fabric offers further opportunity to train and evaluate models of unprecedented size and quality},
keywords = {microsoft, neural network, ai, fpga},
review = {- 233 images/sec for 25 W for FPGA Arria 10 [9.32 images/sec/watt] vs 824 images/sec for 235 W for Tesla K40 [3.5 images/sec/watt] on ImageNet 1K},
timestamp = {2019-06-11},
}
@Electronic{moore1965cramming,
  author    = {Moore, Gordon Earle},
  editor    = {{Fairchild Semiconductor}},
title = {Cramming more components onto integrated circuits},
date = {1965-04-19},
url = {https://newsroom.intel.com/wp-content/uploads/sites/11/2018/05/moores-law-electronics.pdf},
abstract = {With unit cost falling as the number of components percircuit rises, by 1975 economics may dictate squeezing asmany as 65,000 components on a single silicon chip.},
keywords = {moore's law, integrated circuit, fairchild semiconductor},
review = {- En 1965, Moore extrapolait que le nombre de composants d’un circuit intégré doublerait chaque année pendant au moins 10 ans pour atteindre 65000 en 1975
- Son extrapolation ne parlait pas des décennies suivantes
- Computers will be more powerful, and will be organized in completely different ways.},
timestamp = {2019-06-11},
}
@Electronic{moore1975progress,
  author    = {Moore, Gordon Earle},
  editor    = {{Intel}},
title = {Progress in digital integrated electronics},
year = {1975},
url = {https://www.eng.auburn.edu/~agrawvd/COURSE/E7770_Spr07/READ/Gordon_Moore_1975_Speech.pdf},
review = {- The new slope might approximate a doubling every two years, rather than every year, by the end of the decade.},
}
@Electronic{unit2015mythesfondateurs,
  editor    = {{UNIT (Université Numérique Ingénierie et Technologie)}},
title = {"Les grands mythes fondateurs" des nanos : la loi de Moore ou l'héritage du talk de Feynman de 1959},
date = {2015-06-21},
url = {http://www.unit.eu/cours/enjeux-nanosciences-nanotechnologies/Module3-FR.pdf},
abstract = {Précisions sur les prévisions de Gordon Earle Moore et la fameuse loi de Moore.},
keywords = {myth, moore's law, richard feynman, nanotechnology},
review = {Précisions sur les prévisions de Gordon Earle Moore et la fameuse loi de Moore.},
timestamp = {2019-06-11},
}
@WWW{reese2018comparinghardware,
  author    = {Reese, Lynnette},
  editor    = {{Embedded Intel Solutions}},
title = {Comparing hardware for artificial intelligence: FPGAs vs. GPUs vs. ASICs},
date = {2018-07-24},
url = {https://eecatalog.com/intel/2018/07/24/comparing-hardware-for-artificial-intelligence-fpgas-vs-gpus-vs-asics/},
keywords = {fpga, asic, gpu, cpu, comparison},
review = {- FPGAs [and ASICs] offer lower latency than GPUs or CPUs which is better for applications that require real-time AI.
- Another area where FPGAs outperform GPUs (and CPUs) is for those applications with a constrained power envelope.
- FPGAs are similar to ASICs except that FPGAs are notoriously difficult to program and ASICs have a typical production cycle time of 12 – 18 months
- Both GPUs and FPGAs can process in parallel on a massive scale. However, FPGAs also surpass GPUs for efficiency in parallel processing
- Although historically complex to program, FPGAs are carving out their own space in AI technology, with new tools that make programming AI applications that much easier.},
timestamp = {2019-06-11},
}
@Electronic{arrow2018fpgacomparison,
  editor    = {{Arrow}},
  title     = {FPGA vs CPU vs GPU vs Microcontroller},
  date      = {2018-10-02},
  url       = {https://static4.arrow.com/-/media/images/research-and-events/articles/1018/arrow_fpgavscpuvsgpuvsmicrocontroller.pdf},
abstract = {Tableaux comparatifs des points forts des différentes technologies},
keywords = {fpga, cpu, gpu, comparison, asic},
timestamp = {2019-06-11},
}
@WWW{feugey2011opencl,
  author    = {Feugey, David},
title = {Altera mise sur l’OpenCL pour révolutionner le monde des FPGA},
date = {2011-11-16},
url = {https://www.silicon.fr/altera-mise-sur-l%E2%80%99opencl-pour-revolutionner-le-monde-des-fpga-65255.html},
keywords = {opencl, cuda, gpu, fpga, altera},
review = {- il y a cinq ans [en 2006], NVIDIA lançait CUDA, une technologie permettant d’exploiter simplement (relativement) la puissance des GPU intégrés aux cartes graphiques dans le cadre de calculs massivement parallèles
- Le support de l’OpenCL au sein des FPGA d’Altera n’en est qu’à ses prémices},
timestamp = {2019-06-11},
}
@WWW{dubuc2018fpgaperspectives,
  author    = {Dubuc, Damien},
  editor    = {{Aneo}},
title = {Afin de terminer notre série de billets, voici quelques réflexions et perspectives que nous ressortons de l’étude},
date = {2018-02-06},
url = {https://www.aneo.eu/2018/02/06/perspectives-quant-a-lutilisation-fpga-chez-aneo-billet-8/},
review = {- Sur FPGA, il est coûteux d’itérer sur un nouveau design, et le grand nombre d’optimisations possibles (différant par leur nature et paramètres) pose une difficulté.},
timestamp = {2019-06-11},
}
@Electronic{altera2013opencl,
  editor    = {{Altera}},
title = {Implementing FPGA design with the OpenCL standard},
date = {2013-11-14},
url = {https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/wp/wp-01173-opencl.pdf},
abstract = {The initial era of programmable technologies contained two different extremes of programmability. As illustrated in Figure 1, one extreme was represented by single core CPU and digital signal processing (DSP) units. These devices were programmable using software consisting of a list of instructions to be executed. These instructions were created in a manner that was conceptually sequential to the programmer, although an advanced processor could reorder instructions to extract instruction-level parallelism from these sequential programs at run time. In contrast, the other extreme of programmable technology was represented by the FPGA. These devices are programmed by creating configurable hardware circuits, which execute completely in parallel. A designer using an FPGA is essentially creating a massively-fine-grained parallel application. For many years, these extremes coexisted with each type of programmability being applied to different application domains. However, recent trends in technology scaling have favored technologies that are both programmable and parallel.},
keywords = {opencl, altera, fpga},
timestamp = {2019-06-11},
}
@Electronic{denisenko2016opencl,
author = {Dmitry Denisenko},
editor = {Intel},
title = {OpenCL for FPGAs},
date = {2016-06-18},
url = {https://cpufpga.files.wordpress.com/2016/04/opencl_for_fpgas_isca_2016.pdf},
keywords = {opencl, fpga, altera, intel},
review = {- How OpenCL concepts map to FPGA architecture
- description of LUTs
- What is OpenCL?},
timestamp = {2019-06-11},
}
@Electronic{castells2018workshop,
author = {David Castells-Rufas},
editor = {Cephis},
title = {Workshop: programming FPGAs with OpenCL},
date = {2018-05-05},
url = {http://www.sie.es/wp-content/uploads/2018/06/FPGA-with-OpenCL.pdf},
keywords = {opencl, fpga, energy efficiency},
review = {- History of energy efficiency
- Dennard Scaling Rules
- Why FPGAs can be energy efficient?
- What FPGAs can provide? Remove intermediate memory from computation datapaths, allow much higher number simultaneous computation units, fine grain (bit level) computation
- Problems: lower frequency due to overheads, difficult to program (HDL)},
timestamp = {2019-06-11},
}
@WWW{hindriksen2014opencl,
author = {Vincent Hindriksen},
editor = {StreamHPC},
title = {Why use OpenCL on FPGAs?},
date = {2014-09-16},
url = {https://streamhpc.com/blog/2014-09-16/use-opencl-fpgas/},
abstract = {Altera has just released the free ebook FPGAs for dummies. One part of the book is devoted to OpenCL, so we’ll quote some extracts here from one of the chapters. The rest of the book is worth a read, so if you want to check the rest of the text, just fill in the form on Altera’s webpage},
keywords = {altera, fpgas for dummies, opencl},
review = {- Today, OpenCL is developed and maintained by the technology consortium Khronos Group. Most FPGA manufacturers provide Software Development Kits (SDKs) for OpenCL development on FPGAs.
- Free ebook FPGAs for dummies},
timestamp = {2019-06-11},
}
@Book{moore2017fpgas,
author = {Andrew Moore and Ron Wilson},
title = {FPGAs for Dummies},
date = {2017-01-09},
editor = {Intel},
isbn = {978-1-119-39049-7},
url = {https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/misc/fpgas_for_dummies_ebook.pdf},
abstract = {Field programmable gate arrays (FPGAs) are integrated cir-cuits that enable designers to program customized digital logic in the field. FPGAs have been around since the 1980s and were originally conceived to give all design teams the ability to create custom logic. In the early days, using an FPGA in your design meant you had to do a lot of programming just to get your FPGA to perform simple functions, so most design-ers avoided them. If you haven’t looked into FPGAs since your university studies way back when, you’ll want to take another look at them.The FPGA has evolved from a useful but humble interface device into a system-level integrated circuit (IC) with its own microprocessors, memory blocks, and interfaces. It’s the next big thing.Now would be a great time to get an inexpensive development kit, download free tools, and begin to explore this world for yourself. And this book will help you understand the practical uses of FPGAs.},
keywords = {fpga, intel, altera, dummies},
review = {- How FPGAs work
- The difference between FPGAs, ASSPs, and ASICs
- To use FPGAs as functional blocks in a system
- [FPGAs] enable you to build exactly the hardware you need
- [FPGAs can be customized meaning] that often […] you can do operations in a simpler, faster, more energy-efficient way.
- An FPGA is a semiconductor device on which the function can be defined after manufacturing
- "field programmable = programmable in the field", programmable sur le terrain, reconfigurable alors que le circuit intégré est installé sur un circuit imprimé
- constraints : cost, performance, power consumption, size, interface…
- OpenCL was first developped by Apple. Today, OpenCL is developed and maintained by the technology consortium Khronos Group.
- FPGAs are inherently parallel
- Single-device motor control: microcontrollers can fall short of the performance demands of sophisticated motor-control algorithms such as direct torque control (DTC) or sensorless field oriented control (SFOC)
- Television broadcasting: television broadcasters use a serial digital interface (SDI) standard to transmit uncompressed digital video. The latest standard is called the 3-Gbps SDI and is capable of moving 4K ultraHD signals. FPGA solutions come with a core transceiver that can function on all three SDI rates (SD SDI, HD SDI and 3G-SDI) on the same transceiver.
- Wireless data: many FPGAs now come equipped with built-in low-latency intellectual property (IP) for advanced networks as well as productivity enhancing tools to allow manufacturers to leverage FPGAs advantages of performance, power, price and productivity to focus their efforts on product differentiation.
- Automotive driver assistance cameras : traditional DSP processors or microcontrollers don't have the power to do real-time video processing and analytics at the same time. Moreover, HDR or high dynamic range, is a necessity for video analytics to be accurate. HDR processing can as much as triple the demand for video signal processing power, taking the performance requirements out of reach for all but the most expensive DSPs.
- High-performance computing: developing application-specific coprocessors. Latest Intel FPGAs build in not just DSP functions but floating-point hardware.},
timestamp = {2019-06-11},
}
@Electronic{wolf2008mpsoc,
author = {Wayne Wolf and Ahmed Amine Jerraya and Grant Martin},
editor = {IEEE},
title = {Multiprocessor System-on-Chip (MPSoC) Technology},
date = {2008-10-10},
url = {http://www.cs.unc.edu/~montek/teaching/Comp790-Fall11/Home/Home_files/2008Wolf.pdf},
abstract = {The multiprocessor system-on-chip (MPSoC) uses multiple CPUs along with other hardware subsystems to implement a system. A wide range of MPSoC architectures have been developed over the past decade. This paper surveys the history of MPSoCs to argue that they represent an important and distinct category of computer architecture. We consider some of the technological trends that have driven the design of MPSoCs. We also survey computer-aided design problems relevant to the design of MPSoCs},
keywords = {Configurable processors, encoding, hardware/software codesign, multiprocessor, multiprocessor system-on-chip(MPSoC)},
review = {- [MPSoCs] are not simply traditional multiprocessors shrunk to a single chip but have been designed to fulfill the unique requirements of embedded applications
- MPSoCs have been in production for much longer than multicore processors.
- multicore <> MPSoCs in that they simply combine many same processors on a chip while MPSoCs combine heterogeneous processors with different requirements},
timestamp = {2019-06-12},
}
@Electronic{baruch2002configurable,
author = {Zoltan Baruch and Octavian Creţ and Kalman Pusztai},
editor = {Technical University of Cluj-Napoca},
title = {Configurable processor},
date = {2002-05-25},
url = {https://www.researchgate.net/publication/229001961_CONFIGURABLE_PROCESSOR},
abstract = {Configurable architectures can deliver the high performance required by
computationally-demanding applications, similar to the ASIC circuits, while providing the
flexibility of the programmable processors. The performances achieved by these architectures
are often one or two orders of magnitude higher than those of processor-based alternatives. In
this paper we describe the design and implementation of a configurable processor. The proces-
sor consists of a constant part and a configurable structure. The constant part allows to solve
simple applications without changing the existing resources. The configurable part is defined by
the user, based on the requirements of a specific application. This part contains application-
specific functional blocks, controlled by special instructions. The integration of a classical proc-
essor and a configurable architecture within the same circuit allows to exploit the advantages of
both architectures. For the design of the configurable processor we used the VHDL language.
The implementation was performed using a Xilinx XCV600E FPGA device. This processor can
be used in several types of applications: data encryption and compression, image processing,
digital signal processing, special arithmetic.},
keywords = {Configurable computing, Configurable architectures, Reconfigurable devices, FPGA devices.},
review = {- The processor contains the kernel of a general-purpose processor and it can be extended in order to be used for specific application. This extension is accomplished by adding new instructions and functional units, without the change of the general-purpose kernel
- In traditional processors, operations are composed temporally by sequencing them in time, using registers or memory to store intermediate results.
- In configurable architectures, tasks are implemented by spatially composing primitive operators, that is, by linking them together by wires.
- FPGA devices can control operations at bit level, while processors can control their operators only at word level. As a result, processors often waste part of their computational capacity when operating on narrow-width data.},
timestamp = {2019-06-12},
}
@Electronic{gonzalez2005configurable,
author = {Ricardo E. Gonzalez},
editor = {Stretch Inc},
title = {A Software Configurable Processor},
date = {2005-09-29},
url = {https://pdfs.semanticscholar.org/89ad/640a6e704844a0098278e79a8ef06a934c62.pdf},
keywords = {assp, asic, fpga, gpu, dsp, cpu, configurable computing},
review = {- Highest compute performance: ASSP/ASIC > FPGA/GPU > DSP > CPU
- Lowest system time-to-market: CPU/ASSP > DSP > FPGA > ASIC
- Software solution: ASSP, CPU, DSP, GPU
- Hardware solution: FPGA, ASIC
- Compute intensive markets & applications:
- military & security (sonar/radar, biometrics)
- medical electronics (CAT scan, ultrasound)
- office automation (printers, scanners)
- networking (encryption/decryption, network security)
- video & audio (broadcast equipment, audio studio)
- EEMBC, the Embedded Microprocessor Benchmark Consortium, develops and certifies real-world benchmarks and benchmark scores to help designers select the right embedded processors for their systems
- ISEF: Instruction Set Extension Fabric},
timestamp = {2019-06-12},
}
@WWW{miller2013configurable,
author = {Warren Miller},
editor = {Electronic Engineering Times},
title = {Configurable processors as an alternative to FPGAs},
date = {2013-07-03},
url = {https://www.eetimes.com/author.asp?section_id=36&doc_id=1318804},
abstract = {An exploration of using configurable processors as an alternative to the traditional FPGA approach to creating a custom system.},
keywords = {configurable processor, fpga, isef},
review = {- Configurable processors can implement many compute oriented functions FPGAs can address but with some distinct advantages.
- Stretch is a microprocessor company that puts programmable fabric inside the processor.
- FPGA companies [puts] processors on FPGAs
- The ISEF is not made up of the familiar Look-Up-Tables (LUTs) seen in most FPGAs.
- [The ISEF] is a vast collection of ALUs, shift registers, and similar compute fabric elements.
- Ability for "pure" software designers to access configurable technology.
- Algorithms that are commonly implemented in the fabric can be moved into the dedicated programmable accelerator on the next generation device.
- FPGA implementations […] target much wider applications sets and must by their nature stay more generic.},
timestamp = {2019-06-12},
}
@WWW{ibm2018mixedprecision,
author = {{IBM Research Editorial Staff}},
editor = {IBM Research Blog},
title = {IBM Scientists Demonstrate Mixed-Precision In-Memory Computing for the First Time; Hybrid Design for AI Hardware},
date = {2018-04-17},
url = {https://www.ibm.com/blogs/research/2018/04/ibm-scientists-demonstrate-mixed-precision-in-memory-computing-for-the-first-time-hybrid-design-for-ai-hardware/},
keywords = {in-memory computing, hybrid, ibm},
review = {- The fundamental design of today’s computers, which are based on the von Neumann architecture, [requires] data to be shuttled back and forth at high speeds, an inefficient process.
- The clear answer is to transition to a non-von Neumann architecture in which memory and processing coexist in some form.
- in-memory computing [uses] nanoscale resistive memory devices, organized in a computational memory unit, for both processing and memory.
- called mixed-precision in-memory computing […] combines a von Neumann machine with a computational memory unit
- In this hybrid design, the computational memory unit performs the bulk of the computational tasks, whereas the von Neumann machine implements a method to iteratively improve or refine the accuracy of the solution},
timestamp = {2019-06-12},
}
@WWW{ibm2017inmemory,
author = {{IBM Research Editorial Staff}},
editor = {IBM Research Blog},
title = {IBM Scientists Demonstrate In-memory Computing with 1 Million Devices for Applications in AI},
date = {2017-10-24},
url = {https://www.ibm.com/blogs/research/2017/10/ibm-scientists-demonstrate-memory-computing-1-million-devices-applications-ai/},
keywords = {in-memory computing, computational memory, ibm, phase change memory},
review = {- “In-memory computing” or “computational memory” is an emerging concept that uses the physical properties of memory devices for both storing and processing information.
- Phase Change Memory},
timestamp = {2019-06-12},
}
@Electronic{sebastian2017temporal,
author = {Abu Sebastian and Tomas Tuma and Nikolaos Papandreou and Manuel Le Gallo and Lukas Kull and Thomas Parnell and Evangelos Eleftheriou},
editor = {Nature communications},
title = {Temporal correlation detection using computational phase-change memory},
date = {2017-10-24},
url = {https://www.nature.com/articles/s41467-017-01481-9},
abstract = {Conventional computers based on the von Neumann architecture perform computation by repeatedly transferring data between their physically separated processing and memory units. As computation becomes increasingly data centric and the scalability limits in terms of performance and power are being reached, alternative computing paradigms with collocated computation and storage are actively being sought. A fascinating such approach is that of computational memory where the physics of nanoscale memory devices are used to perform certain computational tasks within the memory unit in a non-von Neumann manner. We present an experimental demonstration using one million phase change memory devices organized to perform a high-level computational primitive by exploiting the crystallization dynamics. Its result is imprinted in the conductance states of the memory devices. The results of using such a computational memory for processing real-world data sets show that this co-existence of computation and storage at the nanometer scale could enable ultra-dense, low-power, and massively-parallel computing systems.},
keywords = {phase-change memory},
timestamp = {2019-06-12},
}
@WWW{thornton2018harvard,
author = {Scott Thornton},
editor = {Microcontroller Tips},
title = {What's the difference between Von-Neumann and Harvard architectures?},
date = {2018-03-08},
url = {https://www.microcontrollertips.com/difference-between-von-neumann-and-harvard-architectures/},
keywords = {architecture, von neumann, harvard, bottleneck},
review = {- In a Von-Neumann architecture, the same memory and bus are used to store both data and instructions
- The Harvard architecture stores machine instructions and data in separate memory units that are connected by different busses.
- Modern processors might share memory but have mechanisms like special instructions that keep data from being mistaken for code: “modified Harvard architecture.”. The memory controller is where the modification is seated, since it handles the memory and how it is used.
- A typical computation with a Von-Neumann architecture is:
- read code
- read memory
- compute
- write memory
- The Von Neumann bottleneck occurs when data taken in or out of memory must wait while the current memory operation is completed
- The Von Neumann bottleneck has increased over time because processors have improved in speed while memory has not progressed as fast.
- It is interesting to note that speculative execution is the conduit for one of the latest security flaws discovered by Google Project Zero, named Spectre.},
timestamp = {2019-06-12},
}
@WWW{arm2008harvard,
author = {{ARM Technical Support Knowledge Articles}},
editor = {ARM Limited},
title = {What is the difference between a von Neumann architecture and a Harvard architecture?},
date = {2008-09-09},
url = {http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka3839.html},
keywords = {von neumann, harvard, architecture},
review = {- Using a simple, unified memory system together with a Harvard architecture is highly inefficient.
- Unless it is possible to feed data into both busses at the same time, it might be better to use a von Neumann architecture processor.},
timestamp = {2019-06-12},
}
@WWW{vandrei2014vonneumann,
author = {VAndrei},
editor = {StackOverflow},
title = {Von Neumann vs Harvard architecture},
date = {2014-11-09},
url = {https://stackoverflow.com/questions/26826248/von-neumann-vs-harvard-architecture},
keywords = {von neumann, harvard, architecture},
review = {- Why isn't a pure Harvard architecture adopted for PC's? L1 Instruction Cache miss ratio is very small. This means that generally code size is not a problem. So it wouldn't make sense to design a fully separate path for code. Data can grow very large but code can't really.
- In DSP's it makes sense to use separate code and data paths. That's because DSP's work mainly on "streaming data" meaning that the need for caching is rather small. Also the DSP codes can contain pre-computed coefficients that increase the code size. So there is a balance between data size and code size, meaning that it makes sense using a Harvard architecture.},
timestamp = {2019-06-12},
}
@WWW{eijkhout2016alternatives,
author = {Victor Eijkhout},
editor = {Quora},
title = {Are there alternatives to the Von Neumann architecture?},
date = {2016-01-10},
url = {https://www.quora.com/Are-there-alternatives-to-the-Von-Neumann-architecture},
keywords = {alternative, von neumann, dataflow},
review = {- The Von Neumann architecture is focused on instructions
- It makes the assumption that data can be found and created faster than the instruction execution rate. This used to be true at some point, when data could be transferred in one cycle and processors took several cycles for an instruction.
- This has not been the case for 2 decades or so. As a result processors are now reasoning about instructions.
- Dataflow has plenty of interesting ideas, but the notion that a compiler could do dataflow analysis on a regular programming language has proved to be a complete chimera.
- explicit dataflow programming is very hard
- Dataflow is used inside your CPU, completely realized in hardware; it is the basis for out-of-order execution, and is used as a mechanism to hide latency},
timestamp = {2019-06-12},
}
@Electronic{fouquetlapar2008vonneumann,
author = {Matthias Fouquet-Lapar},
editor = {SGI},
title = {The von Neumann Architecture and Alternatives},
date = {2008-05-16},
url = {https://wwz.ifremer.fr/pcdm/content/download/29481/407627/file/mfl.pdf},
keywords = {von neumann, bottleneck, xpp, hpc, fpga, gpu, ibm cell},
review = {- Physical semi-conductor limits
- clock frequency
- heat
- etching below 10 nm
- Von Neumann bottleneck: cache memory is used to hide memory latencies, hyperthreading allows additional hiding of memory references by switching to a different thread
- Caches occupy more and more places on the die
- Additional transistors can be used to add cores on the same die
- The ONLY problem is your application/workflow because it has to be parallelized
- Maybe you also need higher performance, better silicon utilization, less power consumption, better price/performance
- The way and degree of parallelization is application specific (one solution cannot fit them all, solution is hybrid)
- 1998, Tensor Processing Unit (TPU): hybrid computational model, adaptive in execution, library-based programming model
- TPU is deeply pipelined and heavily chained vector operations
- TPU, some datapaths are reconfigurable
- TPU for radar, medical imaging (CT), image processing (compression, decompression, filtering)
- technologies for hybrid acceleration: FPGA, GPU, ClearSpeed, PACT XPP, multi-core CPUs, IBM Cell
- PACT XPP, from von-Neumann instruction to configuration/data flow
- Not every Accelerator Technology is applicable to every HPC Problem – and it’s a moving target
- Not every HPC Application can be accelerated
- a solution has to be price/performance competitive.
- There should be at least a 10-20X speedup compared to current top-end CPUs},
timestamp = {2019-06-12},
}
@WWW{stackoverflow2012nonvonneumann,
author = {{Various persons}},
editor = {StackOverflow},
title = {What are some examples of non-Von Neumann architectures?},
date = {2012-11-12},
url = {https://stackoverflow.com/questions/1806490/what-are-some-examples-of-non-von-neumann-architectures},
keywords = {von neumann, architecture, harvard, dataflow, reduction, cellular automata, quantum computing},
review = {- Harvard architecture
- Modified Harvard architecture
- Dataflow machines
- Reduction machines
- Cellular automata
- Quantum computing / Quantum Turing machine
- Problem dependent machines},
timestamp = {2019-06-12},
}
@Electronic{schuler2001xpp,
author = {E. Schüler and Tim Helfers},
editor = {PACT, Astrium},
title = {XPP - eXtreme Processing Platform Technology for space applications},
date = {2001-09-20},
url = {http://spacewire.esa.int/WG/Data-Systems/OPDP-proceedings/Presentations/session%20II.B/3_Astrium_XPP_Helfers.pdf},
keywords = {xpp, architecture, von neumann, fpga},
review = {- XPP reconfigurable processing cores
- conventional processors use the sequential model, each operation takes one clock cycle, multiple operations are computed consecutively
- Multiple computations are mapped as code sections onto a two dimensional array (flow graph mapping)
- The code sections are mapped directly onto the processing array
- Von Neumann architecture = instructions
- XPP architecture = configuration
- Other technologies: VLIW architectures, Multi DSP cores, Reconfigurable processors, FPGA
- comparison between multi-DSP cores, processor applications in FPGAs and reconfigurable coprocessors},
timestamp = {2019-06-12},
}
@Electronic{compton2002reconfigurable,
author = {Katherine Compton and Scott Hauck},
editor = {ACM Computing Surveys},
title = {Reconfigurable computing: a survey of systems and software},
date = {2002-05-14},
url = {https://people.ece.uw.edu/hauck/publications/ConfigCompute.pdf},
abstract = {Due to its potential to greatly accelerate a wide variety of applications, reconfigurable computing has become a subject of a great deal of research. Its key feature is the ability to perform computations in hardware to increase performance, while retaining much of the flexibility of a software solution. In this survey, we explore the hardware aspects of reconfigurable computing machines, from single chip architectures to multi-chip systems, including internal structures and external coupling. We also focus on the software that targets these machines, such as compilation tools that map high-level algorithms directly to the reconfigurable substrate. Finally, we consider the issues involved in run-time reconfigurable systems, which reuse the configurable hardware during program execution},
keywords = {reconfigurable computing, fpga, architecture},
review = {- Reconfigurable computing as a concept has been in existence for quite some time [Estrin et al. 1963]
- The recent advances in reconfigurable computing are for the most part derived from the technologies developed for FPGAs in the mid-1980s
- FPGAs were originally created to serve as a hybrid device between PALs and Mask-Programmable Gate Arrays (MPGAs).
- The flexibility, capacity,and performance of [FPGAs] has opened up completely new avenues in high-performance computation, forming the basis of reconfigurable computing.
- when the percentage of logic blocks used in an FPGA becomes very high, automatic routing tools frequently have difficulty achieving the necessary connections between the blocks
- Programmable logic tends to be inefficient at implementing certain types of operations, such as variable-length loops and branch control.
- a reconfigurable unit may be used
* to provide reconfigurable functional units within a host processor,
* as a coprocessor
* as an attached reconfigurable processing unit
* as an external stand-alone processing unit
- Since the introduction of FPGAs in the mid-1980s, there have been many different investigations into what computation element(s) should be built into the array
- These structures, commonly called logic blocks or cells, vary in complexity
- Because multiplication is one of the more difficult computations to implement efficiently in a traditional FPGA structure, the custom multiplication hardware embedded within a reconfigurable array allows a system to perform even that function well
- Altera has demonstrated a preliminary ARM9-based Excalibur device, which combines reconfigurable hardware with an embedded ARM9 processor core [Altera 2001]
- for FPGAs, high-LUT utilization may not necessarily be the most desirable situation, but rather efficient routing usage may be of more importance. This is because the routing resources occupy a much larger part of the area of an FPGA than the logic resources, and therefore the most area-efficient designs will be those that optimize their use of the routing resources rather than the logic resources.},
timestamp = {2019-06-12},
}
@WWW{sato2017tpu,
author = {Kaz Sato and Cliff Young and David Patterson},
editor = {Google},
title = {An in-depth look at Google’s first Tensor Processing Unit (TPU)},
date = {2017-05-12},
url = {https://cloud.google.com/blog/products/gcp/an-in-depth-look-at-googles-first-tensor-processing-unit-tpu},
abstract = {There’s a common thread that connects Google services such as Google Search, Street View, Google Photos and Google Translate: they all use Google’s Tensor Processing Unit, or TPU, to accelerate their neural network computations behind the scenes.},
keywords = {tpu, tensor processing unit, google, architecture, reconfigurable computing},
review = {- In short, we found that the TPU delivered 15–30X higher performance and 30–80X higher performance-per-watt than contemporary CPUs and GPUs
- Google considered building an Application-Specific Integrated Circuit (ASIC) for neural networks as early as 2006
- We chose to package the processor as an external accelerator card that fits into an SATA hard disk slot for drop-in installation.
- The TPU is connected to its host via a PCIe Gen3 x16 bus that provides 12.5GB/s of effective bandwidth.
- Quantization is a powerful tool for reducing the cost of neural network predictions
- Being able to use integer rather than floating point operations greatly reduces the hardware footprint and energy consumption of our TPU
- A TPU contains 65,536 8-bit integer multipliers
- [The TPU is] designed to be flexible enough to accelerate the computations needed to run many different kinds of neural network models.
- We chose the Complex Instruction Set Computer (CISC) style as the basis of the TPU instruction set instead
- The heart of the TPU: A systolic array
- CPUs and GPUs often spend energy to access multiple registers per operation. A systolic array chains multiple ALUs together, reusing the result of reading a single register.
- It makes an engineering tradeoff: limiting registers, control and operational flexibility in exchange for efficiency and much higher operation density.
- a TPU can process 65,536 multiply-and-adds for 8-bit integers every cycle. Because a TPU runs at 700MHz, a TPU can compute 65,536 × 700,000,000 = 46 × 1012 multiply-and-add operations or 92 Teraops per second (92 × 1012) in the matrix unit
- A TPU has none of the sophisticated microarchitectural features that consume transistors and energy to improve the average case but not the 99th-percentile case: no caches, branch prediction, out-of-order execution, multiprocessing, speculative prefetching, address coalescing, multithreading, context switching and so forth.
- Minimalism is a virtue of domain-specific processors.},
timestamp = {2019-06-12},
}
@WWW{dean2017cloudtpu,
author = {Jeff Dean and Urs Hölzle},
editor = {Google},
title = {Build and train machine learning models on our new Google Cloud TPUs},
date = {2017-05-17},
url = {https://blog.google/products/google-cloud/google-cloud-offer-tpus-machine-learning/},
keywords = {tpu, tensor processing unit, google, reconfigurable computing, ai},
review = {- Each of these new TPU devices delivers up to 180 teraflops of floating-point performance
- A TPU pod contains 64 second-generation TPUs and provides up to 11.5 petaflops to accelerate the training of a single large machine learning model
- Shazam recently announced that they successfully migrated major portions of their music recognition workloads to NVIDIA GPUs on Google Cloud and saved money while gaining flexibility.},
timestamp = {2019-06-12},
}
@Electronic{veen1986dataflow,
author = {Arthur H. Veen},
editor = {ACM Computing Surveys},
title = {Dataflow machine architecture},
date = {1986-12-31},
url = {https://www.researchgate.net/publication/220566271},
abstract = {Dataflow machines are programmable computers of which the hardware is optimized for fine-grain data-driven parallel computation. The principles and complications of data-driven execution are explained, as well as the advantages and costs of fine-grain parallelism. A general model for a dataflow machine is presented and the major design options are discussed.Most dataflow machines described in the literature are surveyed on the basis of this model and its associated technology. For general-purpose computing the most promising dataflow machines are those that employ packet-switching communication and support general recursion. Such a recursion mechanism requires an extremely fast mechanism to map a sparsely occupied virtual space to a physical space of realistic size. No solution has yet proved fully satisfactory.A working prototype of one processing element is described in detail. On the basis of experience with this prototype, some of the objections raised against the dataflow approach are discussed. It appears that the overhead due to fine-grain parallelism can be made acceptable by sophisticated compiling and employing special hardware for the storage of data structures. Many computing-intensive programs show sufficient parallelism. In fact, a major problem is to restrain parallelism when machine resources tend to get overloaded. Another issue that requires further investigation is the distribution of computation and data structures over the processing elements.},
keywords = {dataflow, data-driven computing, architecture},
review = {- data-driven parallel computing
- the efficiency of a parallel computer is influenced by several conflicting factors: contention (for a shared resource, usually shared memory or some other communication channel), scalability (property that the performance of the machine can always be improved by adding more processing elements)
- In dataflow machines scheduling is based on availability of data; this is called data-driven execution.},
timestamp = {2019-06-12},
}
@Electronic{jones2010gpufpga,
author = {David H. Jones and Adam Powell and Christos-Savvas Bouganis and Peter Y. K. Cheung},
editor = {Imperial College London},
title = {GPU versus FPGA for high productivity computing},
date = {2010-08-03},
url = {http://cas.ee.ic.ac.uk/people/ccb98/papers/DavidFPL10.pdf},
abstract = {Heterogeneous or co-processor architectures are becoming an important component of high productivity computing systems (HPCS). In this work the performance of a GPU based HPCS is compared with the performance of a commercially available FPGA based HPC. Contrary to previous approaches that focussed on specific examples, a broader analysis is performed by considering processes at an architectural level. A set of benchmarks is employed that use different process architectures in order to exploit the benefits of each technology. These include the asynchronous pipelines common to “map” tasks, a partially synchronous tree common to “reduce” tasks and a fully synchronous, fully connected mesh. We show that the GPU is more productive than the FPGA architecture for most of the benchmarks and conclude that FPGA-based HPCS is being marginalised by GPUs.},
keywords = {hpcs, high productivity computing system, architecture, gpu, fpga, comparison},
timestamp = {2019-06-12},
}
@Electronic{kalarot2010stereo,
author = {Kalarot, Ratheesh and Morris, John},
editor = {{The University of Auckland / IEEE}},
title = {Comparison of {FPGA} and {GPU} Implementations of Real-Time Stereo Vision},
date = {2010-05-19},
url = {https://www.researchgate.net/profile/John_Morris25/publication/224165460_Comparison_of_FPGA_and_GPU_implementations_of_real-time_stereo_vision/links/0f317539b0c42b50be000000.pdf},
abstract = {Real-time stereo vision systems have many applications - from autonomous navigation for vehicles through surveillance to materials handling. Accurate scene interpretation depends on an ability to process high resolution images in real-time, but, although the calculations for stereo matching are basically simple, a practical system needs to evaluate at least {$10^9$} disparities every second - beyond the capability of a single processor. Stereo correspondence algorithms have high degrees of inherent parallelism and are thus good candidates for parallel implementations. In this paper, we compare the performance obtainable with an FPGA and a GPU to understand the trade-off between the flexibility but relatively low speed of an FPGA and the high speed and fixed architecture of the GPU. Our comparison highlights the relative strengths and limitations of the two systems. Our experiments show that, for a range of image sizes, the GPU manages {$2 \times 10^9$} disparities per second, compared with {$2.6 \times 10^9$} disparities per second for an FPGA.},
keywords = {gpu, fpga, comparison},
review = {- For this application at least, the FPGA implementation is superior, despite a much slower internal clock.},
timestamp = {2019-06-12},
}
@Electronic{chase2009opticalflow,
author = {Chase, Jeff and Nelson, Brent and Bodily, John and Wei, Zhaoyi and Lee, Dah-Jye},
editor = {{Brigham Young University}},
title = {Real-Time Optical Flow Calculations on {FPGA} and {GPU} Architectures: A Comparison Study},
date = {2009-04-30},
url = {https://www.researchgate.net/profile/Lee_Dah-Jye/publication/224362818_Real-Time_Optical_Flow_Calculations_on_FPGA_and_GPU_Architectures_A_Comparison_Study/links/0c9605327135c229e0000000.pdf},
abstract = {FPGA devices have often found use as higher-performance alternatives to programmable processors for implementing a variety of computations. Applications successfully implemented on FPGAs have typically contained high levels of parallelism and have often used simple statically-scheduled control and modest arithmetic. Recently introduced computing devices such as coarse grain reconfigurable arrays, multi-core processors, and graphical processing units (GPUs) promise to significantly change the computational landscape for the implementation of high-speed real-time computing tasks. One reason for this is that these architectures take advantage of many of the same application characteristics that fit well on FPGAs. One real-time computing task, optical flow, is difficult to apply in robotic vision applications in practice because of its high computational and data rate requirements, and so is a good candidate for implementation on FPGAs and other custom computing architectures. In this paper, a tensor-based optical flow algorithm is implemented on both an FPGA and a GPU and the two implementations discussed. The two implementations had similar performance, but with the FPGA implementation requiring 12× more development time. Other comparison data for these two technologies is then given for three additional applications taken from a MIMO digital communication system design, providing additional examples of the relative capabilities of these two technologies.},
keywords = {fpga, gpu, comparison},
review = {- The results were mixed with similar performance provided by both FPGA and GPU platforms for the optical flow and trellis computations and better performance provided by the FPGA for the pilot detector and timing and channel estimator computations.
- FPGAs possess unrivaled flexibility for combining custom I/O with computation.
- GPUs seem to be more sensitive to compute-to-I/O ratio than FPGAs.},
timestamp = {2019-06-12},
}
@WWW{kidd2009power1,
author = {Kidd, Taylor IoT},
editor = {{Intel Developer Zone}},
title = {Why {P} Scales as {$C*V^2*f$} is so Obvious},
date = {2009-06-29},
url = {https://software.intel.com/en-us/blogs/2009/06/29/why-p-scales-as-cv2f-is-so-obvious},
keywords = {power, energy, consumption, cmos, physics},
review = {- Explanation of the formula},
timestamp = {2019-06-14},
}
@WWW{kidd2009power2,
author = {Kidd, Taylor IoT},
editor = {{Intel Developer Zone}},
title = {Why {P} Scales as {$C*V^2*f$} is so Obvious (pt 2)},
date = {2015-01-01},
url = {https://software.intel.com/en-us/blogs/2009/08/25/why-p-scales-as-cv2f-is-so-obvious-pt-2-2},
keywords = {power, energy, consumption, cmos, physics},
internal-note = {NOTE(review): date field says 2015-01-01 but the URL path suggests publication on 2009-08-25 -- verify against the page},
timestamp = {2019-06-14},
}
@WWW{vipress2019semis,
editor = {{ViPress.net}},
title = {Infineon et {NXP} devant {STMicroelectronics} au 1er trimestre 2019},
date = {2019-05-17},
url = {https://www.vipress.net/infineon-et-nxp-devant-stmicroelectronics-au-1er-trimestre-2019/},
abstract = {Après avoir déjà dépassé Samsung au 4e trimestre 2018, Intel renforce ainsi son avance sur le Coréen de près de 3 milliards de dollars au 1er trimestre 2019, selon IC Insights. Intel a détrôné Samsung en tant que premier fournisseur de semiconducteurs au 4e trimestre 2018 après avoir perdu sa place au profit de Samsung au 2e trimestre 2019. Alors que Samsung occupait le premier rang du classement pour l’ensemble de 2017 et de 2018, Intel devrait reprendre facilement le premier rang pour l’année entière de 2019, poste qu’il occupait auparavant de 1993 à 2016. Le retournement des marchés des mémoires Drams et flash NAND au cours de l’année écoulée explique ce basculement. Au premier trimestre 2018, les ventes totales de semiconducteurs de Samsung étaient supérieures de 23% à celles d’Intel ; au 1er trimestre 2019, c’est l’inverse : celles d’Intel dépassent celles du Coréen de 23% !},
keywords = {semi-conducteur, classement, intel, samsung, ventes},
timestamp = {2019-06-14},
}
@Electronic{conrad2009kit,
editor = {{Conrad}},
title = {Kit d’apprentissage de l’électronique pour débutants},
date = {2009-07-31},
url = {https://produktinfo.conrad.com/datenblaetter/175000-199999/192230-an-01-fr-LERNPAKET_25_ELEKTRONIK_EXPERIMENTE.pdf},
keywords = {électronique, composants, conrad, circuit},
timestamp = {2019-06-15},
}
@Electronic{gomezprado2006routing,
author = {Gómez Prado, Daniel Francisco},
editor = {{University of Massachusetts}},
title = {Tutorial on {FPGA} Routing},
date = {2006-08-31},
url = {http://sisbib.unmsm.edu.pe/bibvirtualdata/publicaciones/electronica/n17_2006/a04.pdf},
abstract = {The entire CAD process that is necessary to implement a circuit in an FPGA (from the RTL description of the design) consists of the following steps: •Logic optimization. Performs two-level or multi-level minimization of the Boolean equations to optimize area, delay, or a combination of both. •Technology mapping. Transforms the Boolean equations into a circuit of FPGA logic blocks. This step also optimizes the total number of logic blocks required (area optimization) or the number of logic blocks in time-critical paths (delay optimization). •Placement. Selects the specific location for each logic block in the FPGA, while trying to minimize the total length of interconnect required. •Routing. Connects the available FPGA’s routing resources1 with the logic blocks distributed inside the FPGA by the placement tool, carrying signals from where they are generated to where they are used.},
keywords = {routing, fpga, model},
review = {- The C [Connection] boxes connect the channel wires with the input and output pins of the CLBs
- The S [Switch] boxes allow wires to switch between vertical and horizontal wires
- Switch boxes that allow connection to any other domain are called Wilton switch boxes, and they are broadly used as they provide greater flexibility on routing
- [Single-length lines] are intended for relatively short connections among CLBs and they span through one CLB only
- [Double-length lines] are similar to the Single-length lines, except that each one spans two CLBs, offering lower routing delays for moderately long connection
- [Long lines] are appropriate for connections that require reaching several CLBs with low-skew.
- Increasing the flexibility of the switch box, the connection box and the number of wires per channel makes routing a trivial problem [17] as all possible interconnections are available. But increasing routing resources has the drawback that waste area and transistors in the FPGA, as only a fraction of those resources will be used for a given design, even worse it increases the number of interconnect transistors which are the principal reason of delay on FPGAs},
timestamp = {2019-06-15},
}
@WWW{vasut2016opensource,
author = {Vašut, Marek},
editor = {{DENX Software Engineering, The Linux Foundation}},
title = {Open-Source Tools for {FPGA} Development},
date = {2016-10-11},
url = {https://www.youtube.com/watch?v=MI18Wk4gxA4},
abstract = {Programmable hardware is becoming increasingly popular in the recent years, yet the software tools for working with such programmable hardware are dominated by closed-source proprietary solutions. This is now changing. In this presentation, Marek will summarize the open-source tools for working with programmable hardware, like "icestorm", "vtr", "ghdl" and "iverilog". Marek will show how to use the open-source tools to produce a working design and explain the benefits and limitations of such solutions. At the end of the talk, Marek will outline the process of implementing such tools to demonstrate why this is so much effort.},
keywords = {open source, fpga, tools, development},
timestamp = {2019-06-19},
}
@Electronic{arenas2018quartus,
author = {Arenas, Aaron},
editor = {{Intel}},
title = {Introduction to {FPGA} Design in {Quartus}},
date = {2018-08-31},
url = {https://fpgawiki.intel.com/uploads/0/07/Intro_to_FPGA_Workshop_Slides.pdf},
abstract = {FPGAs at Intel
Fundamentals of Digital Electronics
FPGA Architecture
Intel® Quartus® Prime Design Software
FPGA Design Flow},
keywords = {fpga, quartus, intel},
timestamp = {2019-06-20},
}
@Electronic{,
editor = {Altera, Intel},
title = {Cyclone V Device Datasheet},
date = {2019-01-25},