date: 2023-12-22 09:44:42

tags: malloc memory check

  • Code
#include <iostream>
#include <malloc.h>
using namespace std;

int main()
{
  // call mallinfo() to fetch heap statistics
  struct mallinfo mi;
  int *a[10];
  size_t old=0;
  size_t delta=0;
  for(int i=0;i<10;i++)
  {
    a[i]=0;
    a[i] = (int*)malloc(40000000);
    mi = mallinfo();
    // total heap usage: arena (sbrk space) plus hblkhd (mmap'd space)
    size_t heap_used = mi.arena + mi.hblkhd;
    // print the heap statistics
    delta = heap_used - old;
    cout << "heap_used: " << heap_used << " bytes| delta: " << delta << endl;
    cout << mi.arena<<" "<<mi.ordblks<<" "<<mi.smblks<<" "<<mi.hblks<<endl;
    old = heap_used;
  }
  return 0;
}
  • Output
heap_used: 40214528 bytes| delta: 40214528
212992 1 0 1
heap_used: 80216064 bytes| delta: 40001536
212992 1 0 2
heap_used: 120217600 bytes| delta: 40001536
212992 1 0 3
heap_used: 160219136 bytes| delta: 40001536
212992 1 0 4
heap_used: 200220672 bytes| delta: 40001536
212992 1 0 5
heap_used: 240222208 bytes| delta: 40001536
212992 1 0 6
heap_used: 280223744 bytes| delta: 40001536
212992 1 0 7
heap_used: 320225280 bytes| delta: 40001536
212992 1 0 8
heap_used: 360226816 bytes| delta: 40001536
212992 1 0 9
heap_used: 400228352 bytes| delta: 40001536
212992 1 0 10

date: 2023-03-30 15:26:14

tags: SACA FORTRAN CRTS

Master-core code main.f90

program main
    use utility
    implicit none
    integer :: i, j
    integer :: N = 10
    integer, pointer :: matrix_A(:,:)
    integer, pointer :: vector_B(:)
    integer, pointer :: vector_C(:)
    type(ParaType)::para
    integer, external :: slave_func
    allocate(matrix_A(N,N), vector_B(N), vector_C(N))
    para%N = N
    para%matrix_A => matrix_A
    para%vector_B => vector_B
    para%vector_C => vector_C
    print *, "A and B"
    do i = 1, N
        do j = 1, N
            matrix_A(j,i) = floor(11*rand())
            write(*, "(I4)", advance="no") matrix_A(j,i)
        end do
        vector_B(i) = floor(11*rand())
        write(*, "(I4)", advance="no") vector_B(i)
        print *
    end do
    call CRTS_Init()
    call CRTS_athread_spawn(slave_func, para)
    call CRTS_athread_join()
    print *, "C = A.dot(B)"
    do i = 1, N
        write(*, "(I4)", advance="no") vector_C(i)
    end do
    print *
end program main

Utility module utility.f90

module utility
    implicit none
    type, public :: ParaType
        integer :: N
        integer,pointer :: matrix_A(:,:)
        integer,pointer :: vector_B(:)
        integer,pointer :: vector_C(:)
    end type ParaType
interface
    integer function CRTS_smng_get_tid()
    end function CRTS_smng_get_tid
end interface
contains
end module utility

Slave-core code slave.f90

subroutine slave_func(para)
    use utility
    implicit none
    type(ParaType), intent(in) :: para
    integer :: N
    integer, pointer :: matrix_A(:,:)
    integer, pointer :: vector_B(:)
    integer, pointer :: vector_C(:)
    integer :: i, j
    integer :: myid
    integer :: tmp
    myid = CRTS_smng_get_tid()
    N = para%N
    if(myid < N) then
        matrix_A => para%matrix_A
        vector_B => para%vector_B
        vector_C => para%vector_C
        tmp = 0
        do i = 1,N
            tmp = tmp + matrix_A(i, myid+1) * vector_B(i)
        end do
        vector_C(myid+1) = tmp
    end if
end subroutine slave_func

Makefile

FC = swgfortran
main: main.o slave.o utility.o
	$(FC) -mhybrid $^ -o $@
main.o: main.f90 utility.o
	$(FC) -mhost -c $<
slave.o: slave.f90 utility.o
	$(FC) -mslave -c slave.f90
utility.o: utility.f90
	$(FC) -c $<
run:
	bsub -I -b -q q_linpack -cgsp 64 -n 1 ./main
clean:
	rm -rf *.o main *.mod

Program output

[screenshot of the program output omitted]

date: 2022-08-22 14:34:18

tags:

cblas_dgemm parameters explained

[dragon](mailto:wauqas@gmail.com) 22-8-15

Code

#include <cblas.h>
#include <stdio.h>

int main()
{
  int i = 0;
  double A[6] = {1.0, 10.0, 20.0, 30.0, 40.0, 50.0};
  double B[6] = {1.0, 10.0, 20.0, 30.0, 40.0, 60.0};
  double C[9] = {.5, .5, .5, .5, .5, .5, .5, .5, .5};
  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, \
              3, 3, 2, 1, A, 3, B, 3, 0, C, 3);
  for (int j = 0; j < 3; j++)
  {
    for (i = 0; i < 3; i++)
      printf("%lf ", C[i*3+j]);
    printf("\n");
  }
  return 0;
}

Explanation

void cblas_dgemm(
        OPENBLAS_CONST enum CBLAS_ORDER Order,      // row-major or column-major storage
        OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, // whether to transpose A
        OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, // whether to transpose B
        OPENBLAS_CONST blasint M,                   // rows of op(A)
        OPENBLAS_CONST blasint N,                   // columns of op(B)
        OPENBLAS_CONST blasint K,                   // columns of op(A) == rows of op(B)
        OPENBLAS_CONST double alpha,                // scaling factor for A
        OPENBLAS_CONST double *A,                   // matrix A
        OPENBLAS_CONST blasint lda,                 // leading dimension of A; depends on Order
        OPENBLAS_CONST double *B,                   // matrix B
        OPENBLAS_CONST blasint ldb,                 // leading dimension of B
        OPENBLAS_CONST double beta,                 // scaling factor for C
        double *C,                                  // matrix C
        OPENBLAS_CONST blasint ldc                  // leading dimension of C
);

The result is $C = \alpha \, op(A) \, op(B) + \beta C$.
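
To see how the leading dimensions track the storage order, here is a sketch of the same shapes in row-major layout (the array contents are then read as different matrices, so the numbers differ from the run above):

#include <cblas.h>
#include <stdio.h>

int main()
{
  double A[6] = {1.0, 10.0, 20.0, 30.0, 40.0, 50.0};
  double B[6] = {1.0, 10.0, 20.0, 30.0, 40.0, 60.0};
  double C[9] = {0};
  /* Row-major: A (3x2, NoTrans) has lda = 2; B (stored 3x2, transposed
     by op) has ldb = 2; C (3x3) has ldc = 3. */
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
              3, 3, 2, 1, A, 2, B, 2, 0, C, 3);
  for (int i = 0; i < 9; i++)
    printf("%lf ", C[i]);
  printf("\n");
  return 0;
}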

date: 2022-08-22 16:39:25

tags: cuda

CUDA

A CUDA kernel function can be declared either as:

__global__ void functionName(){}

or

void __global__ functionName(){}


Hello World

Hello.cu

```c++
#include <stdio.h>

__global__ void GPU_print()
{
    printf("Hello World\n");
}

int main(int argc, char **argv)
{
    GPU_print<<<2, 2>>>();
    cudaDeviceSynchronize();
    return 0;
}
```

Compile:

nvcc Hello.cu -o Hello

Execute:

./Hello

Result:

Hello World
Hello World
Hello World
Hello World

Why four lines? <<<2,2>>> launches 2 blocks of 2 threads each, so printf runs 2 × 2 = 4 times.


CUDA kernels run on the device, so we must keep host data and device data apart.

Let's look at the basic CUDA memory functions.

cudaMalloc(
    (void**)&ptr,  // int *ptr  ->  (void**)&ptr
    size           // in bytes, e.g. count * sizeof(type)
);

After the call, ptr points to memory allocated on the device.

cudaMemcpy(
    dst,   // destination of the data
    src,   // source of the data
    size,  // how many bytes
    kind   // direction: cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, ...
);

cudaDeviceSynchronize()

Blocks until all outstanding work on the device has finished.
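
Putting cudaMalloc, cudaMemcpy, and a kernel launch together, a minimal round-trip sketch (assumes a CUDA-capable device):

```c++
#include <stdio.h>

__global__ void doubleAll(int *d, int n)
{
    int i = threadIdx.x;
    if (i < n) d[i] *= 2;
}

int main()
{
    const int n = 4;
    int h[n] = {1, 2, 3, 4};
    int *d;
    cudaMalloc((void**)&d, n * sizeof(int));                   // allocate on the device
    cudaMemcpy(d, h, n * sizeof(int), cudaMemcpyHostToDevice); // host -> device
    doubleAll<<<1, n>>>(d, n);
    cudaMemcpy(h, d, n * sizeof(int), cudaMemcpyDeviceToHost); // device -> host
    for (int i = 0; i < n; i++) printf("%d ", h[i]);           // prints: 2 4 6 8
    printf("\n");
    cudaFree(d);
    return 0;
}
```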


The GPU has a lot in common with the CPU; having several different kinds of memory is one of them.

Dynamic global memory

```c++
double *a;
int M = 1000;                 // size in bytes
cudaMalloc((void**)&a, M);
cudaMemcpy(....);             // arguments elided
kernel<<<grid, block>>>(a);   // grid/block are launch-configuration placeholders
```
Static global memory (global and static):

__device__ double a[5];
__device__ int b;

The compiler must know the size of a static global variable at compile time, and kernels use such variables directly; they need not be passed as arguments (no kernel<<<1,1>>>(a); required).

Constant data passed as a kernel argument:

```c++
int a = 1;
kernel<<<1, 1>>>(a);
```

Arguments are read-only inside the kernel, and the kernel parameter buffer is limited to 4 KB.

Constant data via __constant__:

```c++
__constant__ int a = 1;
```

Read-only in kernels; constant memory totals 64 KB on most NVIDIA GPUs.

How do we copy data to and from such symbols?

cudaMemcpyFromSymbol(
    dst,     // destination on the host
    symbol,  // source: a __device__ or __constant__ variable
    count    // bytes to copy; offset and kind (cudaMemcpyDeviceToHost) have defaults
);

cudaMemcpyToSymbol(
    symbol,  // destination: a __device__ or __constant__ variable
    src,     // source on the host
    count    // bytes to copy; offset and kind (cudaMemcpyHostToDevice) have defaults
);
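
A small sketch of both directions (coeff and the host arrays are illustrative names):

```c++
__constant__ int coeff[4];

int main()
{
    int h[4] = {1, 2, 3, 4};
    cudaMemcpyToSymbol(coeff, h, sizeof(h));         // host -> constant memory
    int back[4];
    cudaMemcpyFromSymbol(back, coeff, sizeof(back)); // constant memory -> host
    return 0;
}
```
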
Static shared memory

```c++
__global__ void kernel(){
    const int n = 10;     // the size must be a compile-time constant
    __shared__ int a[n];
}
```

Read/write, up to 64 KB per block (the exact limit depends on the architecture); all threads of a block share the same copy.

Dynamic shared memory

Outside, the size is passed as the third launch parameter:

kernel<<<1, 1, sharedMemorySize>>>()

Inside, declare an unsized extern array:

extern __shared__ double a[]

It must be a[], not *a: a pointer is not an array.
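
A sketch tying the two parts together (kernel and n are placeholder names):

```c++
__global__ void kernel(int n)
{
    extern __shared__ double a[];   // sized by the third launch parameter
    if (threadIdx.x < n)
        a[threadIdx.x] = threadIdx.x;
    __syncthreads();                // all threads of the block now see the values
}

// launched as: kernel<<<1, 32, n * sizeof(double)>>>(n);
```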

Register memory

Fast and small:

__global__ void kernel(){
    int a = 1;        // plain automatic variables usually live in registers
    const int b = 1;
}
Dynamic unified (managed) memory

```c++
double *x, *y;
const int M = sizeof(double) * 10000;
cudaMallocManaged((void**)&x, M);   // accessible from host and device
cudaMallocManaged((void**)&y, M);
*x = 1;                             // the host may dereference managed pointers directly
*y = 2;
kernel<<<grid, block>>>(x, y);      // grid/block are placeholders
```
Static unified memory

```c++
__device__ __managed__ int ret[1000];
__device__ __managed__ int a;

int main(){
    kernel<<<grid, block>>>();   // grid/block are placeholders
    cudaDeviceSynchronize();
    printf("%d\n", a);
}
```
Free Memory
`cudaFree(void* ptr)`

Atomic operations

They look like synchronization, but they do not synchronize: an atomic only guarantees that its read-modify-write happens indivisibly, not any ordering between threads.
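
For example, a minimal counting kernel (a sketch; counter must point to device memory):

```c++
__global__ void count(int *counter)
{
    // the read-modify-write is indivisible, but no ordering
    // between threads is implied
    atomicAdd(counter, 1);
}
```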


CUDA streams

Let's look at the stream functions.

Type: cudaStream_t stream

cudaStreamCreate(&stream)

cudaStreamDestroy(stream)

cudaStreamSynchronize(stream): block until all work in the stream has finished.

cudaStreamQuery(stream): check whether the stream has finished; returns cudaSuccess or cudaErrorNotReady.

Run a kernel in a stream: kernel<<<gridSize, blockSize, shared_size, stream>>>();

cudaMemcpyAsync(
    void *dst,
    const void *src,
    size_t count,
    enum cudaMemcpyKind kind,
    cudaStream_t stream   // pass 0 to use the default stream
);

Which buffers can be used with cudaMemcpyAsync? Page-locked (pinned) host memory, allocated with:

cudaMallocHost(void **ptr, size_t size)

cudaHostAlloc(void **ptr, size_t size, unsigned int flags)

With flags == cudaHostAllocDefault, cudaHostAlloc is equivalent to cudaMallocHost.

Free with: cudaFreeHost(void *ptr)
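
Putting streams and pinned memory together, an end-to-end sketch (assumes a CUDA-capable device):

```c++
#include <stdio.h>

__global__ void scale(float *d, int n)
{
    int i = threadIdx.x;
    if (i < n) d[i] *= 2.0f;
}

int main()
{
    const int n = 8;
    float *h, *d;
    cudaMallocHost((void**)&h, n * sizeof(float));   // pinned host memory
    cudaMalloc((void**)&d, n * sizeof(float));
    for (int i = 0; i < n; i++) h[i] = (float)i;

    cudaStream_t s;
    cudaStreamCreate(&s);
    cudaMemcpyAsync(d, h, n * sizeof(float), cudaMemcpyHostToDevice, s);
    scale<<<1, n, 0, s>>>(d, n);   // ordered after the copy within stream s
    cudaMemcpyAsync(h, d, n * sizeof(float), cudaMemcpyDeviceToHost, s);
    cudaStreamSynchronize(s);      // wait for the whole pipeline

    for (int i = 0; i < n; i++) printf("%g ", h[i]);
    printf("\n");
    cudaStreamDestroy(s);
    cudaFreeHost(h);
    cudaFree(d);
    return 0;
}
```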

date: 2022-08-23 14:52:12

tags: vscode fortran debug

please make sure you have gdb and gfortran

  • launch.json

    {
        "version": "0.0.1",
        "configurations": [
            {
                "name": "Fortran Launch (GDB)",
                "type": "cppdbg",
                "request": "launch",
                "targetArchitecture": "x64",
                "program": "${workspaceRoot}\\${fileBasenameNoExtension}.exe",
                "miDebuggerPath": "D:\\cygwin64\\bin\\gdb.exe",
                "args": [],
                "stopAtEntry": false,
                "cwd": "${workspaceRoot}",
                "externalConsole": true,
                "preLaunchTask": "gfortran"
            }
        ]
    }
    

    "miDebuggerPath" insert your gdb full path.

  • tasks.json

    {
        "tasks": [
            {
                "label": "gfortran",
                "type": "shell",
                "command": "gfortran -o ${fileBasenameNoExtension}.exe  ${fileBasename} -g",
                "options": {
                    "cwd": "${workspaceFolder}"
                },
                "group": {
                    "kind": "build",
                    "isDefault": true
                }            
            }
        ],
        "version": "2.0.0"
    }
    

Configuration complete.

Usage


There are several ways to start debugging the example code.

You can also set breakpoints.

Best wishes.

date: 2023-03-02 15:12:08

tags: gpt python

gpt-3.5-turbo was released on March 1; let's look at how to call it.

  • Calling it with a POST request
import requests

url = 'https://api.openai.com/v1/chat/completions'

Question = input("Question:")

data = {"model": "gpt-3.5-turbo","messages": [{"role": "user", "content": Question}]}

headers = {
    'Authorization': 'Bearer sk-' # append your own API key after "sk-"
}

response = requests.post(url, json=data, headers=headers)

if response.status_code == 200:
    x = response.json()
    print(x['choices'][0]['message']['content'])
else:
    print("error:",response.status_code)

Start the program, type a question, and the request is sent.

  • Calling it with the Python openai module
import openai
openai.api_key="sk-"
message=[]
while(True):
    Q = input("user:")
    message.append({"role":"user","content":Q})
    com=openai.ChatCompletion.create(model="gpt-3.5-turbo",messages=message)
    message.append({"role":"assistant","content":com.choices[0].message.content})
    print("assistant:",com.choices[0].message.content)

Unlike the code above, this version carries the conversation context, much like chatting with ChatGPT in the browser. The user's role is set to user and the AI's role to assistant. At the moment, neither the direct POST request nor the Python openai module requires an external network connection, which makes everyday use fairly convenient.

Program size

Practical testing shows that the plain-POST version builds into an exe of about 6 MB, while the openai-module version must bundle the openai package and comes to about 70 MB, so the POST version is recommended. The following program also keeps the conversation context:

import requests

url = 'https://api.openai.com/v1/chat/completions'

headers = {
    'Authorization': 'Bearer sk-'
}

message=[]

while(True):
    Q = input("user:")
    message.append({"role":"user","content":Q})
    data = {"model": "gpt-3.5-turbo","messages": message}
    response = requests.post(url, json=data, headers=headers)
    if response.status_code == 200:
        x = response.json()
        print("assistant:",x['choices'][0]['message']['content'])
        message.append({"role":"assistant","content":x['choices'][0]['message']['content']})
    else:
        print("error:",response.status_code)
        break

date: 2022-08-23 11:10:23

tags: cuda cpp

make_unique_with_cuda

#include "stdio.h"
#include <memory>
namespace cuda
{
    template <typename T>
    [[nodiscard]] static auto malloc(std::size_t const size)
    {
        // [[nodiscard]] means callers must use the return value, or the compiler warns
        T *d{nullptr};
        cudaMalloc(&d, sizeof(T) * size);
        return d;
    }
    template <typename T>
    static void free(T *ptr)
    {
        if (ptr)
        {
            cudaFree(ptr);
            ptr=nullptr;
        }
    }
    template <typename T>
    [[nodiscard]] static auto makeUnique(std::size_t size)
    {
        return std::unique_ptr<T[], decltype(&free<T>)> { malloc<T>(size), free<T> };
    }
} // namespace cuda
__global__ void kernel(float3 *d)
{
    int id = threadIdx.x;
    // d[id].x=d[id].y=d[id].z=id*1.1;
    printf("%g\t%g\t%g\n", d[id].x, d[id].y, d[id].z);
}
void showh(float3 *d)
{
    for (int id = 0; id < 5; id++)
    {
        printf("%g\t%g\t%g\n", d[id].x, d[id].y, d[id].z);
    }
}
using namespace cuda;
int main(void)
{
    auto const count = 5;
    auto hp_points{std::make_unique<float3[]>(count)};
    for (int i = 0; i < count; i++)
    {
        hp_points[i].x=i*1.1;
        hp_points[i].y=i*1.1;
        hp_points[i].z=i*1.1;
    }
    // showh(hp_points.get());
    auto dp_points{cuda::makeUnique<float3>(count)};
    cudaMemcpy(dp_points.get(),hp_points.get(), // get() returns the raw pointer managed by the unique_ptr
    sizeof(float3)*count,cudaMemcpyHostToDevice);
    kernel<<<1,5>>>(dp_points.get());
    cudaDeviceSynchronize();
    // dp_points releases the device memory via cuda::free when it goes out of scope;
    // calling the destructor explicitly here would cause a double free
    return 0;
}

date: 2022-11-17 14:36:31

tags: makefile wildcard filter-out patsubst

File list

main.f90
makefile
sub1.f90
sub2.f90
sub3.f90

makefile code

main = main.f90

SRC = $(wildcard *.f90)

SRC := $(filter-out $(main),$(SRC))

SRC := $(patsubst %.f90, %.o, $(SRC))

main:

    echo $(SRC)
  • wildcard keeps glob expansion when used in a variable assignment
  • filter-out removes every word of its first argument from the list in its second
  • patsubst performs pattern substitution on each word; the code above turns every .f90 into .o (see the sketch after this list)
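
A sketch of how such a list is typically consumed, assuming each .o is built from the matching .f90 by a pattern rule (recipe lines must start with a tab):

main: main.f90 $(SRC)
	gfortran $^ -o $@

%.o: %.f90
	gfortran -c $<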

Result

$> make
echo  sub1.o  sub2.o  sub3.o
 sub1.o  sub2.o  sub3.o

date: 2023-03-13 15:50:12

tags:

MPI_Cart_Create

Maps a one-dimensional communicator onto a multi-dimensional Cartesian topology.

Interface

subroutine MPI_Cart_shift(comm, direction, disp, rank_source, rank_dest)
    integer, intent(in)  :: comm          ! communicator
    integer, intent(in)  :: direction     ! dimension index
    integer, intent(in)  :: disp          ! displacement
    integer, intent(out) :: rank_source   ! rank that sends to this process; -1 if none
    integer, intent(out) :: rank_dest     ! rank this process sends to; -1 if none
end subroutine MPI_Cart_shift
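
For reference, the matching MPI_Cart_create interface, written as a sketch in the same style (following the MPI Fortran binding):

subroutine MPI_Cart_create(comm_old, ndims, dims, periods, reorder, comm_cart, ierror)
    integer, intent(in)  :: comm_old      ! original communicator
    integer, intent(in)  :: ndims         ! number of Cartesian dimensions
    integer, intent(in)  :: dims(*)       ! processes per dimension
    logical, intent(in)  :: periods(*)    ! ring (periodic) per dimension?
    logical, intent(in)  :: reorder       ! may ranks be renumbered?
    integer, intent(out) :: comm_cart     ! new Cartesian communicator
    integer, intent(out) :: ierror
end subroutine MPI_Cart_create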

Test

  • Observe how period affects the generated topology
program main
    use mpi
    integer ierr
    integer rank,size
    integer np_dim(2)
    logical period(2)
    integer mpi_world_cart
    integer src,dest
    np_dim=3
    period(1)=.false.
    period(2)=.true.
    call MPI_Init(ierr)
    call MPI_Cart_create( MPI_COMM_WORLD    &
                            , 2                 &
                            , np_dim            &
                            , period            &
                            , .false.           &
                            , mpi_world_cart    &
                            , ierr              &
                            )
    call MPI_Comm_rank(mpi_world_cart,rank,ierr)
    call MPI_Comm_size(mpi_world_cart,size,ierr)
    call MPI_Cart_shift(mpi_world_cart, 0, 1, src, dest,ierr)
    call MPI_Finalize(ierr)
    print *,rank,"of",size,"|",src,"of",dest,"|",(ierr==0)
end program main

Output

           6 of           9 |           3 of          -1 | T 
           1 of           9 |          -1 of           4 | T
           2 of           9 |          -1 of           5 | T
           0 of           9 |          -1 of           3 | T
           5 of           9 |           2 of           8 | T   <- for example: rank 5 receives from 2 and sends to 8
           4 of           9 |           1 of           7 | T
           3 of           9 |           0 of           6 | T
           7 of           9 |           4 of          -1 | T
           8 of           9 |           5 of          -1 | T

period for the first dimension is .false., so along that dimension the topology is a line rather than a ring;

period for the second dimension is .true., so along that dimension the topology is a ring: the process after the last one is the first.

Together this yields a cylinder-like structure: around the base the ends are joined, along the height they are not.

Because MPI_Cart_shift here shifts along the first dimension and period(1) == .false., ranks 0-2 get src == -1 and ranks 6-8 get dest == -1.

Below is the program's output with period(1) == .true.:

           0 of           9 |           6 of           3 | T
           6 of           9 |           3 of           0 | T
           8 of           9 |           5 of           2 | T   <- rank 8 receives from 5 and sends to 2
           7 of           9 |           4 of           1 | T
           1 of           9 |           7 of           4 | T
           3 of           9 |           0 of           6 | T
           2 of           9 |           8 of           5 | T
           4 of           9 |           1 of           7 | T
           5 of           9 |           2 of           8 | T

The reorder argument of MPI_Cart_create specifies whether the created Cartesian topology may be renumbered. If reorder is true, the MPI library is free to rearrange process ranks to improve performance; if it is false, the library must build the Cartesian topology in the processes' original order.

When reorder is true, the library may rearrange the process layout for performance. For example, suppose the original layout is:

0 1 2 3
4 5 6 7
8 9 10 11

with reorder true, the library might rearrange it into, say:

0 4 8 9
1 5 6 10
2 3 7 11

The point is to place neighboring processes physically closer together and thus reduce communication latency. Note, however, that renumbering ranks can break a program that assumes the original ordering, so use this feature with care.

date: 2022-08-25 09:17:16

tags: openblas

OpenBlas-Make

environment:

  • redhat
  • gcc version 4.8.5 20150623 (Red Hat 4.8.5-36) (GCC)

build

make CC=gcc FC=gfortran MAKE_NB_JOBS=1
make install PREFIX=<install_path>

If you build without MAKE_NB_JOBS and an old LAPACK is already installed on the machine, the OpenBLAS library may be generated before its bundled LAPACK is built, yet it will not fall back to the old LAPACK either.

test

#include <cblas.h>
#include <stdio.h>

int main()
{
  int i=0;
  double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};         
  double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};  
  double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5}; 
  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);

  for(i=0; i<9; i++)
    printf("%lf ", C[i]);
  printf("\n");
  return 0;
}
$ gcc main.cpp -I ~/OpenBLAS/include -L ~/OpenBLAS/lib -lopenblas -o solver
$ ./solver
11.000000 -9.000000 5.000000 -9.000000 21.000000 -1.000000 5.000000 -1.000000 3.000000

date: 2023-01-08 21:20:37

tags:

Offline installation

The machine's Python version is 3.6.8, but we need to install Python 3.8.5, because:

PyG is available for Python 3.7 to Python 3.10.

Install Python 3.8.5

https://www.python.org/downloads/release/python-385/

Unpack the tar.xz archive with tar xf

Enter the extracted directory and run the classic three-step install:

./configure --prefix=${HOME}/Python3.8.5
make
make install

Edit .bashrc: this time, besides adding the Python bin directory to PATH, add an alias so the python3 under /usr/bin is masked.

#Python-3.8.5
export PATH=$PATH:${HOME}/Python3.8.5/bin
alias python3=${HOME}/Python3.8.5/bin/python3

Check the version

[liyl@gpu-node1 ~]$ python3
Python 3.8.5 (default, Jan  5 2023, 14:09:20) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-44)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>

Install wheel

pip3 install wheel-0.37.1-py2.py3-none-any.whl --user

Install numpy

https://pypi.org/project/numpy/1.23.5/#files numpy-1.23.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

The installation packages have been placed in the docs directory.

python3 -m pip install numpy-1.23.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --user

--user: installs into the user's home directory, so no sudo is needed; packages land in **~/.local/lib/python3.8/site-packages**

Install typing_extensions

python3 -m pip install typing_extensions-4.4.0-py3-none-any.whl --user

Install PyTorch

http://download.pytorch.org/whl/torch_stable.html

torch-1.13.1+cpu-cp38-cp38-linux_x86_64.whl

python3 -m pip install torch-1.13.1+cpu-cp38-cp38-linux_x86_64.whl --user

Notes

Mind the installation order.

If the wheels you were given fail to install numpy or PyTorch, first check the Python version and the host platform, which can be queried with the following command:

python3 -m pip debug --verbose

My output:

Compatible tags: 87
  cp38-cp38-manylinux2014_x86_64
  cp38-cp38-manylinux2010_x86_64
  cp38-cp38-manylinux1_x86_64
  cp38-cp38-linux_x86_64
  cp38-abi3-manylinux2014_x86_64
  cp38-abi3-manylinux2010_x86_64
  cp38-abi3-manylinux1_x86_64
  cp38-abi3-linux_x86_64
  cp38-none-manylinux2014_x86_64
  cp38-none-manylinux2010_x86_64
  cp38-none-manylinux1_x86_64
  cp38-none-linux_x86_64
  cp37-abi3-manylinux2014_x86_64
  cp37-abi3-manylinux2010_x86_64
  cp37-abi3-manylinux1_x86_64
  cp37-abi3-linux_x86_64
  cp36-abi3-manylinux2014_x86_64
  cp36-abi3-manylinux2010_x86_64
  cp36-abi3-manylinux1_x86_64
  cp36-abi3-linux_x86_64
  cp35-abi3-manylinux2014_x86_64
  cp35-abi3-manylinux2010_x86_64
  cp35-abi3-manylinux1_x86_64
  cp35-abi3-linux_x86_64
  cp34-abi3-manylinux2014_x86_64
  cp34-abi3-manylinux2010_x86_64
  cp34-abi3-manylinux1_x86_64
  cp34-abi3-linux_x86_64
  cp33-abi3-manylinux2014_x86_64
  cp33-abi3-manylinux2010_x86_64
  cp33-abi3-manylinux1_x86_64
  cp33-abi3-linux_x86_64
  cp32-abi3-manylinux2014_x86_64
  cp32-abi3-manylinux2010_x86_64
  cp32-abi3-manylinux1_x86_64
  cp32-abi3-linux_x86_64
  py38-none-manylinux2014_x86_64
  py38-none-manylinux2010_x86_64
  py38-none-manylinux1_x86_64
  py38-none-linux_x86_64
  py3-none-manylinux2014_x86_64
  py3-none-manylinux2010_x86_64
  py3-none-manylinux1_x86_64
  py3-none-linux_x86_64
  py37-none-manylinux2014_x86_64
  py37-none-manylinux2010_x86_64
  py37-none-manylinux1_x86_64
  py37-none-linux_x86_64
  py36-none-manylinux2014_x86_64
  py36-none-manylinux2010_x86_64
  py36-none-manylinux1_x86_64
  py36-none-linux_x86_64
  py35-none-manylinux2014_x86_64
  py35-none-manylinux2010_x86_64
  py35-none-manylinux1_x86_64
  py35-none-linux_x86_64
  py34-none-manylinux2014_x86_64
  py34-none-manylinux2010_x86_64
  py34-none-manylinux1_x86_64
  py34-none-linux_x86_64
  py33-none-manylinux2014_x86_64
  py33-none-manylinux2010_x86_64
  py33-none-manylinux1_x86_64
  py33-none-linux_x86_64
  py32-none-manylinux2014_x86_64
  py32-none-manylinux2010_x86_64
  py32-none-manylinux1_x86_64
  py32-none-linux_x86_64
  py31-none-manylinux2014_x86_64
  py31-none-manylinux2010_x86_64
  py31-none-manylinux1_x86_64
  py31-none-linux_x86_64
  py30-none-manylinux2014_x86_64
  py30-none-manylinux2010_x86_64
  py30-none-manylinux1_x86_64
  py30-none-linux_x86_64
  cp38-none-any
  py38-none-any
  py3-none-any
  py37-none-any
  py36-none-any
  py35-none-any
  py34-none-any
  py33-none-any
  py32-none-any
  py31-none-any
  py30-none-any

Pick offline packages that match the tags printed on your machine.

Search for packages by name at https://pypi.org/project/


A simple smoke test of numpy and torch

import torch as t
import numpy as n
a1 = t.ones(3,3)
a2 = t.ones(3,3)
b1 = n.ones((3,3))
b2 = n.ones((3,3))
print(a1+a2)
print(b1+b2)

Output

[liyl@gpu-node1 torch]$ python3 test.py 
tensor([[2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.]])
[[2. 2. 2.]
 [2. 2. 2.]
 [2. 2. 2.]]

pyG

pytorch-geometric, pyG for short

https://pytorch-geometric.com/whl/torch-1.13.0%2Bcpu.html

Find the version you need there.

Install the remaining dependencies:

python3 -m pip install --user torch_scatter-2.1.0+pt113cpu-cp38-cp38-linux_x86_64.whl 
python3 -m pip install --user scipy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
python3 -m pip install --user torch_cluster-1.6.0+pt113cpu-cp38-cp38-linux_x86_64.whl
python3 -m pip install --user torch_sparse-0.6.16+pt113cpu-cp38-cp38-linux_x86_64.whl
python3 -m pip install --user torch_spline_conv-1.2.1+pt113cpu-cp38-cp38-linux_x86_64.whl
python3 -m pip install --user tqdm-4.64.1-py2.py3-none-any.whl
python3 -m pip install --user pytz-2022.7-py2.py3-none-any.whl
python3 -m pip install --user six-1.16.0-py2.py3-none-any.whl
python3 -m pip install --user python_dateutil-2.8.2-py2.py3-none-any.whl
python3 -m pip install --user pandas-1.5.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
python3 -m pip install --user MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
python3 -m pip install --user Jinja2-3.1.2-py3-none-any.whl
python3 -m pip install --user certifi-2022.12.7-py3-none-any.whl
python3 -m pip install --user charset_normalizer-2.1.1-py3-none-any.whl
python3 -m pip install --user urllib3-1.26.13-py2.py3-none-any.whl
python3 -m pip install --user idna-3.4-py3-none-any.whl
python3 -m pip install --user requests-2.28.1-py3-none-any.whl
python3 -m pip install --user pyparsing-3.0.9-py3-none-any.whl
python3 -m pip install --user joblib-1.2.0-py3-none-any.whl
python3 -m pip install --user threadpoolctl-3.1.0-py3-none-any.whl
python3 -m pip install --user scikit_learn-1.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
  • Install torch-geometric from source

    pyG mirror link

https://mirrors.aliyun.com/pypi/simple/torch-geometric

cd torch_geometric-2.0.4
python3 -m pip install . --user

Test that the import succeeds

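A quick check from the shell (torch_geometric is the import name of the torch-geometric package):

python3 -c "import torch_geometric; print(torch_geometric.__version__)"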

date: 2022-11-27 11:11:18

tags: x11 xorg

#include <X11/Xlib.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
int main()
{
    Display *dsp = XOpenDisplay(NULL);
    if (!dsp)
    {
        return 1;
    }
    int screenNumber = DefaultScreen(dsp);
    unsigned long white = WhitePixel(dsp, screenNumber);
    unsigned long black = BlackPixel(dsp, screenNumber);

    Window win = XCreateSimpleWindow(dsp,
                                     DefaultRootWindow(dsp), //parent
                                     50, 50,   // origin point
                                     400, 400, // size
                                     0, black, // border width and color
                                     white);   // background color
    XMapWindow(dsp, win);
    Colormap screen_colormap = DefaultColormap(dsp, DefaultScreen(dsp));
    XColor red;
    int rc = XAllocNamedColor(dsp, screen_colormap, "red", &red, &red);
    // alloc color 'red' by its name
    if (rc == 0)
    {
        fprintf(stderr, "XAllocNamedColor - failed to allocated 'red' color.\n");
        exit(1);
    }

    GC gc = XCreateGC(dsp, win, 0, NULL);
    XSetForeground(dsp, gc, black);
    XDrawLine(dsp, win, gc, 10, 10, 190, 190); //draw a line with color black
    XFlush(dsp);
    sleep(1);
    XSetForeground(dsp, gc, red.pixel); // change color for next draw
    XDrawLine(dsp, win, gc, 210, 210, 390, 390); //draw a line with color red
    XFlush(dsp);
    getchar(); // wait for a key press before tearing down
    XDestroyWindow(dsp, win);
    XCloseDisplay(dsp);
    return 0;
}
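
Build and run (assuming the libX11 development headers are installed; the output name is illustrative):

gcc main.c -o xdemo -lX11
./xdemo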

[screenshot: the window showing one black and one red diagonal line]

date: 2024-06-06 15:53:08

tags: Fortran dynamic DLL LoadLibrary

  • Source code of DLL

    subroutine test(a, n)
        implicit none
        !dec$ attributes dllexport::test
        integer::n
        integer::a(n)
        integer::i
        do i = 1, n
            a(i) = i*i
        end do
    end subroutine test
    
  • Source code of executable file

    use iso_c_binding
    implicit none
    interface
        function LoadLibrary(name) bind(c, name='LoadLibraryA')
            import :: c_ptr, c_char
            character(kind=c_char), intent(in) :: name(*)
            type(c_ptr) :: LoadLibrary
        end function LoadLibrary
    
        function GetProcAddress(hModule, lpProcName) bind(c, name='GetProcAddress')
            import :: c_ptr, c_char, c_funptr
            type(c_ptr), value :: hModule
            character(kind=c_char), intent(in) :: lpProcName(*)
            type(c_funptr) :: GetProcAddress
        end function GetProcAddress
    
        subroutine FreeLibrary(hModule) bind(c, name='FreeLibrary')
            import :: c_ptr
            type(c_ptr), value :: hModule
        end subroutine FreeLibrary
    end interface
    
    interface
        subroutine test_interface(a,n)
            integer::n
            integer::a(n)
        end subroutine test_interface
    end interface
    
    integer, parameter::n = 5
    integer::a(n)
    
    type(c_ptr) :: funcdll
    type(c_funptr) :: funcptr
    procedure(test_interface), pointer :: test => null()
    
    funcdll = LoadLibrary("./func.dll"//c_null_char)
    if (.not. C_ASSOCIATED(funcdll)) then
        print *, "Failed to load DLL."
        stop
    end if
    funcptr = GetProcAddress(funcdll, "TEST"//c_null_char)
    
    if (.not. C_ASSOCIATED(funcptr)) then
        print *, "Failed to load func."
        stop
    end if
    
    call c_f_procpointer(funcptr,test)
    
    call test(a,n)
    print *,a
    
    call FreeLibrary(funcdll)
    end
    
    
  • compile commands

    • generate func.dll

      ifx /DLL /o func.dll func.f90

    • generate exe

      ifx main.f90

  • console output

    1 4 9 16 25

date: 2023-12-22 09:44:42

tags: cuda workflow github

Ubuntu

name: CUDA CI

on:
  pull_request:
    branches:
      - master  # adjust the branch as needed

jobs:
  build:
    runs-on: ubuntu-latest
    container: 
      image: nvidia/cuda:11.2.2-devel-ubuntu20.04  # use the official NVIDIA image matching your CUDA version

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Install dependencies
        run: |
          apt-get update
          apt-get install -y build-essential

      - name: Build with Make
        run: cd src && make


Windows

name: CUDA CI on Windows

on:
  pull_request:
    branches:
      - master

jobs:
  build:
    runs-on: windows-2019

    steps:
    - name: Checkout code
      uses: actions/checkout@v2      
      
    - name: Set up MSVC environment
      uses: ilammy/msvc-dev-cmd@v1.13.0
      with:
        arch: x64
        vsversion: "2019"
      
    - name: Install CUDA      
      run: |
        choco install cuda --version=11.2.2.46133
        
    - name: Build project
      shell: cmd
      run: |
        set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\bin
        nvcc --version
        cd src && make

