back

koyumi0601 · Mar 12, 2024 · d76e648 · d76e648
1 parent a474475
commit d76e648
Show file tree

Hide file tree

Showing 304 changed files with 20,953 additions and 125 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1939,3 +1939,31 @@ _posts/Cuda/practice_045_captureMonitorPdfGeneration/document/latex/refman.tex
 _posts/Cuda/practice_045_captureMonitorPdfGeneration/document/latex/s2.png
 _posts/Cuda/practice_045_captureMonitorPdfGeneration/document/latex/small_rabit.png
 _posts/Cuda/practice_045_captureMonitorPdfGeneration/document/latex/tabu_doxygen.sty
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/vcpkg.applocal.log
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/vc143.pdb
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/vc143.idb
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/utils.obj
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/main.obj
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/kernel.cu.obj
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/kernel.cu.cache
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/kernel.cu-629993211.deps
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.vcxproj.FileListAbsolute.txt
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/link.write.2u.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/link.write.1.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/link.read.1.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/link.command.1.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/examples.lastbuildstate
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/CudaCompile.write.1u.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/CudaCompile.read.1u.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/CL.write.1.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/CL.read.1.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/Cl.items.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.tlog/CL.command.1.tlog
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.pdb
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.log
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.lib
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.ilk
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.exp
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.exe.recipe
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/examples.exe
+_posts/Cuda/practice_046_cuda_book_examples/examples/x64/Debug/example01.obj
diff --git a/_posts/Cuda/2023-12-28-cuda-env.md b/_posts/Cuda/2023-12-28-cuda-env.md
@@ -78,11 +78,12 @@ cntrl + shift + B
 ./my_cuda_program
 ```
 
-- 테스트 코드
+- 테스트 코드: cudaDeviceProp 구조체를 이용해서 property 가져오기
 
 ```c
 #include <iostream>
 #include <cuda_runtime.h>
+#include <helper_cuda.h> // _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)
 
 int main() {
     int deviceCount;
@@ -92,11 +93,12 @@ int main() {
         cudaDeviceProp deviceProp;
         cudaGetDeviceProperties(&deviceProp, deviceIdx);
 
-        std::cout << "Device " << deviceIdx << ": " << deviceProp.name << std::endl;
-        std::cout << "  Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
-        std::cout << "  Multiprocessors: " << deviceProp.multiProcessorCount << std::endl;
+        std::cout << "Device " << deviceIdx << ": " << deviceProp.name << std::endl; // char[256]
+        std::cout << "  Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl; // compute capability 주 버전. 마이너 버전. 둘 다 int
+        std::cout << "  Multiprocessors: " << deviceProp.multiProcessorCount << std::endl; // SM 갯수
         std::cout << "  CUDA Cores per Multiprocessor: " << deviceProp.warpSize << std::endl;
-        std::cout << "  Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
+        std::cout << "  CUDA cores: " << _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount << std::endl;
+        std::cout << "  Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl; // GPU의 global(device) 메모리 크기. 단위 byte
         std::cout << "  Shared Memory per Block: " << deviceProp.sharedMemPerBlock / 1024 << " KB" << std::endl;
         std::cout << "  Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
         std::cout << "  Max Threads per Dimension: (" << deviceProp.maxThreadsDim[0] << ", "
@@ -110,6 +112,7 @@ int main() {
 }
 ```
 
+# GTX 960
 ```c
 Device 0: NVIDIA GeForce GTX 960
   Compute Capability: 5.2
@@ -123,4 +126,18 @@ Device 0: NVIDIA GeForce GTX 960
   Tensor Core 지원: No
 ```
 
+# RTX 3060
+```c
+Device 0: NVIDIA GeForce RTX 3060
+  Compute Capability: 8.6
+  Multiprocessors: 28
+  CUDA Cores per Multiprocessor: 32
+MapSMtoCores for SM 8.168 is undefined.  Default to use 128 Cores/SM
+  CUDA cores: 128  Global Memory: 12287 MB
+  Shared Memory per Block: 48 KB
+  Max Threads per Block: 1024
+  Max Threads per Dimension: (1024, 1024, 64)
+  Max Grid Size: (2147483647, 65535, 65535)
+```
+
 - Ray tracing, 메모리 버스 너비, 클럭 속도, AI 및 machine learning 지원 여부 확인 요망
diff --git a/_posts/Cuda/2024-01-03-cuda-study.md b/_posts/Cuda/2024-01-03-cuda-study.md
@@ -20,10 +20,11 @@ search: true
 - **Programming Guide** [https://docs.nvidia.com/cuda/cuda-c-programming-guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide)
 - Quick Start Guide [https://docs.nvidia.com/cuda/cuda-quick-start-guide/](https://docs.nvidia.com/cuda/cuda-quick-start-guide/)
 - CUDA Programming read docs. [https://cuda.readthedocs.io/ko/latest/rtx2080Ti/](https://cuda.readthedocs.io/ko/latest/rtx2080Ti/)
+- 구글검색 nvidia ampere architecture white paper [https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf](https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf)
 
 
 # Code 
-- Samples: NVIDIA Github 
+- Samples: NVIDIA Github [https://github.com/NVIDIA/cuda-samples](https://github.com/NVIDIA/cuda-samples)
 - 자주 쓰는 코드: https://blog.naver.com/PostView.nhn?blogId=lithium81&logNo=80143506571
 
 
@@ -62,27 +63,58 @@ search: true
 
 ## 구조
 
+# 6.1.1 스트리밍 멀티프로세서
+- 페르미 아키텍처: 하나의 SM에 32개의 CUDA core를 가지고 있다
+
+## Instruction Cache
+- 명령어 1개가 워프 내 모든 스레드에 사용된다. SIMT (Single Instruction, Multiple Threads)
+
+## Warp Scheduler
+- 다음에 처리할 명령어를 결정하거나 명령을 내린다
+## Dispatch Unit
+- 명령어 전달 유닛
+
+
+## **Register File**
+- 32768 x 32 bit
+- CORE들은 레지스터를 등분해서 가지고 있는다. 자신만의 작업책상이 있다.
+- 따라서 문맥 교환 비용이 0이다.
+
+## **CUDA Core**
+- 간단히는, cuda core하나가 스레드 하나를 처리한다고 볼 수 있다
+### Dispatch port
+### Operand Collector
+### FP Unit
+- 실제 연산 유닛
+### Int Unit
+- 실제 연산 유닛
+### Result Queue
+## LD/ST
+## SFU
+## Interconnect Network
+## 64kB **shared memory** / **L1 cache**
+
+## Uniform cache
+
+# 6.2 CUDA 스레드 계층과 GPU 하드웨어
+- 스레드 -> 워프 -> 스레드 블록 -> 그리드 -> GPU
+- 그리드 1개 = GPU 1개.
+  - 멀티 GPU라고 하더라도 그리드 1개는 GPU 1개에 대응된다.
+  - 반면, GPU 내에서 여러 개의 그리드는 실행 가능하다.
+
 - https://www.youtube.com/watch?v=gSgZNdT9414 33:54
 
 | 항목            | CUDA               | HW               |
-|----------------|--------------------|------------------|
-| 연산 단위        | Thread             | SP or CUDA Core  |
-| HW 점유 단위     | Block              | SM               |
+|-----------------|--------------------|------------------|
+| 연산 단위       | Thread             | SP or CUDA Core  |
+| HW 점유 단위    | Block              | SM               |
 | Shared Memory  | 48KB `__shared__`  | L1 Cache (16KB + 4KB) |
 | Barrier        | `__syncthreads()`  | -                |
 | 언어            | C/C++, ptr 사용 가능 | -                |
 
-<<<<<<< HEAD
-- grid = GPU
-- 
-
 
-# Code Samples
-
-- NVIDIA Github : [https://github.com/NVIDIA/cuda-samples](https://github.com/NVIDIA/cuda-samples)
-=======
 ![img](https://t1.daumcdn.net/cfile/tistory/16282136509A061507)
->>>>>>> 13085658d6d806b0d39f3a0e058dcecde4eba73e
+
 
 
 
@@ -121,3 +153,60 @@ search: true
 
 
 
+# 부동소수점 연산
+- IEEE 754 표준의 부동소수점 표현방식
+- [31] sign [30-23] exponent [22-0] fraction
+## 허용오차 10^-5
+## float -> double
+- 허용 오차와 함께 사용
+## API
+| Function Name         | Operation                                  | Description                                                         |
+|-----------------------|--------------------------------------------|---------------------------------------------------------------------|
+| `__fadd_rn(x, y)`     | `x + y` (Float, Round to Nearest)          | 가장 가까운 값으로 라운딩하여 더하기 연산을 수행합니다.                  |
+| `__fadd_rz(x, y)`     | `x + y` (Float, Round Towards Zero)        | 0 방향으로 라운딩하여 더하기 연산을 수행합니다.                          |
+| `__fadd_ru(x, y)`     | `x + y` (Float, Round Up)                  | 무조건 올림하여 더하기 연산을 수행합니다.                               |
+| `__fadd_rd(x, y)`     | `x + y` (Float, Round Down)                | 무조건 내림하여 더하기 연산을 수행합니다.                               |
+| `__fmul_rn(x, y)`     | `x * y` (Float, Round to Nearest)          | 가장 가까운 값으로 라운딩하여 곱하기 연산을 수행합니다.                   |
+| `__fmul_rz(x, y)`     | `x * y` (Float, Round Towards Zero)        | 0 방향으로 라운딩하여 곱하기 연산을 수행합니다.                          |
+| `__fmul_ru(x, y)`     | `x * y` (Float, Round Up)                  | 무조건 올림하여 곱하기 연산을 수행합니다.                                |
+| `__fmul_rd(x, y)`     | `x * y` (Float, Round Down)                | 무조건 내림하여 곱하기 연산을 수행합니다.                                |
+| `__fmaf_rn(x, y, z)`  | `x * y + z` (Float, Round to Nearest)      | 가장 가까운 값으로 라운딩하여 곱한 후 더하기 연산을 수행합니다.           |
+| `__fmaf_rz(x, y, z)`  | `x * y + z` (Float, Round Towards Zero)    | 0 방향으로 라운딩하여 곱한 후 더하기 연산을 수행합니다.                   |
+| `__fmaf_ru(x, y, z)`  | `x * y + z` (Float, Round Up)              | 무조건 올림하여 곱한 후 더하기 연산을 수행합니다.                         |
+| `__fmaf_rd(x, y, z)`  | `x * y + z` (Float, Round Down)            | 무조건 내림하여 곱한 후 더하기 연산을 수행합니다.                         |
+| `__frcp_rn(x)`        | `1/x` (Float, Round to Nearest)            | 가장 가까운 값으로 라운딩하여 역수를 계산합니다.                          |
+| `__frcp_rz(x)`        | `1/x` (Float, Round Towards Zero)          | 0 방향으로 라운딩하여 역수를 계산합니다.                                 |
+| `__frcp_ru(x)`        | `1/x` (Float, Round Up)                    | 무조건 올림하여 역수를 계산합니다.                                       |
+| `__frcp_rd(x)`        | `1/x` (Float, Round Down)                  | 무조건 내림하여 역수를 계산합니다.                                       |
+| `__fdiv_rn(x, y)`     | `x / y` (Float, Round to Nearest)          | 가장 가까운 값으로 라운딩하여 나누기 연산을 수행합니다.                    |
+| `__fdiv_rz(x, y)`     | `x / y` (Float, Round Towards Zero)        | 0 방향으로 라운딩하여 나누기 연산을 수행합니다.                           |
+| `__fdiv_ru(x, y)`     | `x / y` (Float, Round Up)                  | 무조건 올림하여 나누기 연산을 수행합니다.                                 |
+| `__fdiv_rd(x, y)`     | `x / y` (Float, Round Down)                | 무조건 내림하여 나누기 연산을 수행합니다.                                 |
+| `__fsqrt_rn(x)`       | `sqrt(x)` (Float, Round to Nearest)        | 가장 가까운 값으로 라운딩하여 제곱근을 계산합니다.                         |
+| `__fsqrt_rz(x)`       | `sqrt(x)` (Float, Round Towards Zero)      | 0 방향으로 라운딩하여 제곱근을 계산합니다.                                |
+| `__fsqrt_ru(x)`       | `sqrt(x)` (Float, Round Up)                | 무조건 올림하여 제곱근을 계산합니다.                                      |
+| `__fsqrt_rd(x)`       | `sqrt(x)` (Float, Round Down)              | 무조건 내림하여 제곱근을 계산합니다.                                      |
+| `__dadd_rn(x, y)`     | `x + y` (Double, Round to Nearest)         | 더블 정밀도로 가장 가까운 값으로 라운딩하여 더하기 연산을 수행합니다.        |
+| `__dadd_rz(x, y)`     | `x + y` (Double, Round Towards Zero)       | 더블 정밀도로 0 방향으로 라운딩하여 더하기 연산을 수행합니다.               |
+| `__dadd_ru(x, y)`     | `x + y` (Double, Round Up)                 | 더블 정밀도로 무조건 올림하여 더하기 연산을 수행합니다.                     |
+| `__dadd_rd(x, y)`     | `x + y` (Double, Round Down)               | 더블 정밀도로 무조건 내림하여 더하기 연산을 수행합니다.                     |
+| `__dmul_rn(x, y)`     | `x * y` (Double, Round to Nearest)         | 더블 정밀도로 가장 가까운 값으로 라운딩하여 곱하기 연산을 수행합니다.        |
+| `__dmul_rz(x, y)`     | `x * y` (Double, Round Towards Zero)       | 더블 정밀도로 0 방향으로 라운딩하여 곱하기 연산을 수행합니다.               |
+| `__dmul_ru(x, y)`     | `x * y` (Double, Round Up)                 | 더블 정밀도로 무조건 올림하여 곱하기 연산을 수행합니다.                     |
+| `__dmul_rd(x, y)`     | `x * y` (Double, Round Down)               | 더블 정밀도로 무조건 내림하여 곱하기 연산을 수행합니다.                     |
+| `__fma_rn(x, y, z)`   | `x * y + z` (Double, Round to Nearest)     | 더블 정밀도로 가장 가까운 값으로 라운딩하여 곱한 후 더하기 연산을 수행합니다. |
+| `__fma_rz(x, y, z)`   | `x * y + z` (Double, Round Towards Zero)   | 더블 정밀도로 0 방향으로 라운딩하여 곱한 후 더하기 연산을 수행합니다.        |
+| `__fma_ru(x, y, z)`   | `x * y + z` (Double, Round Up)             | 더블 정밀도로 무조건 올림하여 곱한 후 더하기 연산을 수행합니다.              |
+| `__fma_rd(x, y, z)`   | `x * y + z` (Double, Round Down)           | 더블 정밀도로 무조건 내림하여 곱한 후 더하기 연산을 수행합니다.              |
+| `__drcp_rn(x)`        | `1/x` (Double, Round to Nearest)           | 더블 정밀도로 가장 가까운 값으로 라운딩하여 역수를 계산합니다.               |
+| `__drcp_rz(x)`        | `1/x` (Double, Round Towards Zero)         | 더블 정밀도로 0 방향으로 라운딩하여 역수를 계산합니다.                      |
+| `__drcp_ru(x)`        | `1/x` (Double, Round Up)                   | 더블 정밀도로 무조건 올림하여 역수를 계산합니다.                            |
+| `__drcp_rd(x)`        | `1/x` (Double, Round Down)                 | 더블 정밀도로 무조건 내림하여 역수를 계산합니다.                            |
+| `__ddiv_rn(x, y)`     | `x / y` (Double, Round to Nearest)         | 더블 정밀도로 가장 가까운 값으로 라운딩하여 나누기 연산을 수행합니다.         |
+| `__ddiv_rz(x, y)`     | `x / y` (Double, Round Towards Zero)       | 더블 정밀도로 0 방향으로 라운딩하여 나누기 연산을 수행합니다.                |
+| `__ddiv_ru(x, y)`     | `x / y` (Double, Round Up)                 | 더블 정밀도로 무조건 올림하여 나누기 연산을 수행합니다.                      |
+| `__ddiv_rd(x, y)`     | `x / y` (Double, Round Down)               | 더블 정밀도로 무조건 내림하여 나누기 연산을 수행합니다.                      |
+| `__dsqrt_rn(x)`       | `sqrt(x)` (Double, Round to Nearest)       | 더블 정밀도로 가장 가까운 값으로 라운딩하여 제곱근을 계산합니다.              |
+| `__dsqrt_rz(x)`       | `sqrt(x)` (Double, Round Towards Zero)     | 더블 정밀도로 0 방향으로 라운딩하여 제곱근을 계산합니다.                     |
+| `__dsqrt_ru(x)`       | `sqrt(x)` (Double, Round Up)               | 더블 정밀도로 무조건 올림하여 제곱근을 계산합니다.                           |
+| `__dsqrt_rd(x)`       | `sqrt(x)` (Double, Round Down)             | 더블 정밀도로 무조건 내림하여 제곱근을 계산합니다.                           |