#include "utils.h"
#include "luaT.h"
#include "THCGeneral.h"
#include "THCCachingAllocator.h"
#include "THCCachingHostAllocator.h"
#include "THCSleep.h"
#include "THCTensorRandom.h"
#include "THCHalf.h" // for CUDA_HALF_TENSOR
extern void cutorch_CudaByteStorage_init(lua_State* L);
extern void cutorch_CudaCharStorage_init(lua_State* L);
extern void cutorch_CudaShortStorage_init(lua_State* L);
extern void cutorch_CudaIntStorage_init(lua_State* L);
extern void cutorch_CudaLongStorage_init(lua_State* L);
extern void cutorch_CudaStorage_init(lua_State* L);
extern void cutorch_CudaDoubleStorage_init(lua_State* L);
#ifdef CUDA_HALF_TENSOR
extern void cutorch_CudaHalfStorage_init(lua_State* L);
#else
extern void cutorch_HalfStorageCopy_init(lua_State *L);
#endif
extern void cutorch_CudaByteTensor_init(lua_State* L);
extern void cutorch_CudaCharTensor_init(lua_State* L);
extern void cutorch_CudaShortTensor_init(lua_State* L);
extern void cutorch_CudaIntTensor_init(lua_State* L);
extern void cutorch_CudaLongTensor_init(lua_State* L);
extern void cutorch_CudaTensor_init(lua_State* L);
extern void cutorch_CudaDoubleTensor_init(lua_State* L);
#ifdef CUDA_HALF_TENSOR
extern void cutorch_CudaHalfTensor_init(lua_State* L);
#else
extern void cutorch_HalfTensorCopy_init(lua_State *L);
#endif
extern void cutorch_CudaByteTensorOperator_init(lua_State* L);
extern void cutorch_CudaCharTensorOperator_init(lua_State* L);
extern void cutorch_CudaShortTensorOperator_init(lua_State* L);
extern void cutorch_CudaIntTensorOperator_init(lua_State* L);
extern void cutorch_CudaLongTensorOperator_init(lua_State* L);
extern void cutorch_CudaTensorOperator_init(lua_State* L);
extern void cutorch_CudaDoubleTensorOperator_init(lua_State* L);
#ifdef CUDA_HALF_TENSOR
extern void cutorch_CudaHalfTensorOperator_init(lua_State* L);
#endif
extern void cutorch_CudaByteTensorMath_init(lua_State* L);
extern void cutorch_CudaCharTensorMath_init(lua_State* L);
extern void cutorch_CudaShortTensorMath_init(lua_State* L);
extern void cutorch_CudaIntTensorMath_init(lua_State* L);
extern void cutorch_CudaLongTensorMath_init(lua_State* L);
extern void cutorch_CudaTensorMath_init(lua_State* L);
extern void cutorch_CudaDoubleTensorMath_init(lua_State* L);
#ifdef CUDA_HALF_TENSOR
extern void cutorch_CudaHalfTensorMath_init(lua_State* L);
#endif
/*
Iteration utilities for lists of streams and lists of gpus with streams
*/
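/*
For illustration, a hedged sketch of the two Lua table shapes these helpers
consume (the particular stream ids and 1-based gpu indices are example
values, not requirements):
---
{1, 2, 3}                  -- flat list of stream ids on a single device
{[1] = {1, 2}, [2] = {3}}  -- per-gpu lists: gpu 1 -> streams 1, 2; gpu 2 -> stream 3
---
*/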
int checkAndCountListOfStreams(lua_State *L, THCState *state, int arg,
int device)
{
if (!lua_istable(L, arg)) {
THError("expecting array of device streams");
}
/* Push table to top */
lua_pushvalue(L, arg);
/* Check that all values in the table are numeric and in bounds */
int streams = 0;
lua_pushnil(L);
while (lua_next(L, -2)) {
if (!lua_isnumber(L, -2)) {
THError("expected array of streams, not table");
}
if (!lua_isnumber(L, -1)) {
THError("array of stream ids must contain numeric ids");
}
int streamId = (int) lua_tonumber(L, -1);
/* This will error out if the stream is not in bounds */
THCState_getDeviceStream(state, device, streamId);
++streams;
lua_pop(L, 1);
}
/* Pop table from top */
lua_pop(L, 1);
return streams;
}
void checkAndCountListOfGPUStreamPairs(lua_State *L, THCState *state, int arg,
int* gpus,
int* streams)
{
if (!lua_istable(L, arg)) {
THError("expecting table of gpu={streams...}");
}
/* Push table to top */
lua_pushvalue(L, arg);
/* Check that all values in the table are tables of numeric and in bounds */
*gpus = 0;
*streams = 0;
lua_pushnil(L);
while (lua_next(L, -2)) {
/* -2 is key (device), -1 is value, in the form device={streams...} */
if (!lua_isnumber(L, -2) || !lua_istable(L, -1)) {
THError("expecting table of gpu={streams...}");
}
int device = (int) lua_tonumber(L, -2) - 1;
/* Verify device is in range */
if (device < 0 || device >= THCState_getNumDevices(state)) {
THError("%d is not a device", device + 1);
}
/* Verify that the list is a list of streams */
*streams += checkAndCountListOfStreams(L, state, -1, device);
++(*gpus);
lua_pop(L, 1);
}
/* Pop table from top */
lua_pop(L, 1);
}
int createSingleDeviceEvents(lua_State *L, THCState *state, int arg,
int device, cudaEvent_t* event)
{
/* Push table to top */
lua_pushvalue(L, arg);
/* Record events */
lua_pushnil(L);
int i = 0;
while (lua_next(L, -2)) {
int streamId = (int) lua_tonumber(L, -1);
cudaStream_t streamWaitingOn =
THCState_getDeviceStream(state, device, streamId);
THCudaCheck(cudaEventCreateWithFlags(&event[i], cudaEventDisableTiming));
THCudaCheck(cudaEventRecord(event[i], streamWaitingOn));
lua_pop(L, 1);
i++;
}
/* Pop table from top */
lua_pop(L, 1);
return i;
}
void createMultiDeviceEvents(lua_State *L, THCState *state, int arg,
cudaEvent_t* events)
{
/* Push {gpu={streams...}} table */
lua_pushvalue(L, arg);
/* Create and record events per each GPU */
int gpu = 0;
lua_pushnil(L);
while (lua_next(L, -2)) {
int device = (int) lua_tonumber(L, -2) - 1;
THCudaCheck(cudaSetDevice(device));
events += createSingleDeviceEvents(L, state, -1, device, events);
++gpu;
lua_pop(L, 1);
}
/* Pop {gpu={streams...}} table */
lua_pop(L, 1);
}
void waitSingleDeviceEvents(lua_State *L, THCState *state, int arg,
int device, cudaEvent_t * event, int numEvents)
{
/* Push table to top */
lua_pushvalue(L, arg);
/* Then, wait on the events. Each stream is actually waiting on itself here
too, but that's harmless and isn't worth weeding out. */
lua_pushnil(L);
while (lua_next(L, -2)) {
int streamId = (int) lua_tonumber(L, -1);
cudaStream_t stream =
THCState_getDeviceStream(state, device, streamId);
for (int i = 0; i < numEvents; i++) {
THCudaCheck(cudaStreamWaitEvent(stream, event[i], 0));
}
lua_pop(L, 1);
}
/* Pop table from top */
lua_pop(L, 1);
}
void waitMultiDeviceEvents(lua_State *L, THCState *state, int arg,
cudaEvent_t* events, int streams)
{
/* Push {gpu={streams...}} table */
lua_pushvalue(L, arg);
/* Then, wait on the events. Each stream is actually waiting on itself here
too, but that's harmless and isn't worth weeding out. */
lua_pushnil(L);
while (lua_next(L, -2)) {
int device = (int) lua_tonumber(L, -2) - 1;
THCudaCheck(cudaSetDevice(device));
/* Push stream table */
lua_pushvalue(L, -1);
lua_pushnil(L);
while (lua_next(L, -2)) {
int streamId = (int) lua_tonumber(L, -1);
cudaStream_t stream =
THCState_getDeviceStream(state, device, streamId);
/* Each stream waits on all events */
for (int i = 0; i < streams; ++i) {
THCudaCheck(cudaStreamWaitEvent(stream, events[i], 0));
}
lua_pop(L, 1);
}
/* Pop stream table and GPU entry */
lua_pop(L, 2);
}
/* Pop {gpu={streams...}} table */
lua_pop(L, 1);
}
/* Synchronizes the host with respect to the current device */
static int cutorch_synchronize(lua_State *L)
{
THCudaCheck(cudaDeviceSynchronize());
return 0;
}
/* Synchronizes the host with respect to all devices */
static int cutorch_synchronizeAll(lua_State *L)
{
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
int devices = -1;
THCudaCheck(cudaGetDeviceCount(&devices));
for (int i = 0; i < devices; ++i) {
THCudaCheck(cudaSetDevice(i));
THCudaCheck(cudaDeviceSynchronize());
}
THCudaCheck(cudaSetDevice(prevDev));
return 0;
}
/*
Usage:
cutorch.reserveStreams(n)
Allocates n user streams on every device present. If fewer than
n streams are currently allocated, more are created to bring the total to n.
If n or more streams are already allocated, this is a no-op.
The default CUDA stream is stream 0 and is always present;
the allocated streams are user streams created in addition to the default
stream (thus, reserveStreams(1) makes two streams available on each device:
the default stream 0 and user stream 1).
*/
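/*
A minimal Lua usage sketch (the count 4 and stream index 2 are example
values, not requirements):
---
cutorch.reserveStreams(4)       -- user streams 1..4 now exist on every device
print(cutorch.getNumStreams())  -- expected to print 4
cutorch.setStream(2)            -- subsequent work on each device uses user stream 2
---
*/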
static int cutorch_reserveStreams(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int numStreams = (int) luaL_checknumber(L, 1);
int nonBlocking = lua_toboolean(L, 2);
THCState_reserveStreams(state, numStreams, nonBlocking);
return 0;
}
/*
Usage:
cutorch.reserveBlasHandles(n)
Allocates n blasHandles on every device present. If fewer than
n blasHandles are currently allocated, more are created to bring the total to n.
If n or more blasHandles are already allocated, this is a no-op.
Unlike for streams, there is no default blasHandle.
*/
static int cutorch_reserveBlasHandles(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int numHandles = (int) luaL_checknumber(L, 1);
THCState_reserveBlasHandles(state, numHandles);
return 0;
}
/*
Usage:
n = cutorch.getNumStreams()
Returns the number of user streams allocated for every device present.
The default is 0.
*/
static int cutorch_getNumStreams(lua_State *L)
{
THCState *state = cutorch_getstate(L);
lua_pushnumber(L, THCState_getNumStreams(state));
return 1;
}
/*
Usage:
n = cutorch.getNumBlasHandles()
Returns the number of user blasHandles allocated for every device present.
The default is 1.
*/
static int cutorch_getNumBlasHandles(lua_State *L)
{
THCState *state = cutorch_getstate(L);
lua_pushnumber(L, THCState_getNumBlasHandles(state));
return 1;
}
/*
Usage:
cutorch.setStream(n)
For all devices, sets the current user stream in use to the index
specified. e.g.,
---
cutorch.setDevice(1)
cutorch.setStream(3)
-- device 1 stream 3 in use here
cutorch.setDevice(2)
-- device 2 stream 3 in use here
---
0 is the default stream on the device.
*/
static int cutorch_setStream(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int stream = (int) luaL_checknumber(L, 1);
THCState_setCurrentStreamIndex(state, stream);
return 0;
}
/*
Usage:
cutorch.setBlasHandle(n)
For all devices, sets the current blasHandle in use to the index
specified. e.g.,
---
cutorch.setDevice(1)
cutorch.setBlasHandle(3)
-- device 1 blasHandle 3 in use here
cutorch.setDevice(2)
-- device 2 blasHandle 3 in use here
---
*/
static int cutorch_setBlasHandle(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int handle = (int) luaL_checknumber(L, 1);
THCState_setCurrentBlasHandleIndex(state, handle);
return 0;
}
/*
Usage:
n = cutorch.getStream()
Returns the current user stream for all devices in use (as previously
set via cutorch.setStream(n)). 0 is the default stream on the device
and is its initial value.
*/
static int cutorch_getStream(lua_State *L)
{
THCState *state = cutorch_getstate(L);
lua_pushnumber(L, THCState_getCurrentStreamIndex(state));
return 1;
}
/*
Usage:
n = cutorch.getBlasHandle()
Returns the current blasHandle for all devices in use (as previously
set via cutorch.setBlasHandle(n)).
*/
static int cutorch_getBlasHandle(lua_State *L)
{
THCState *state = cutorch_getstate(L);
lua_pushnumber(L, THCState_getCurrentBlasHandleIndex(state));
return 1;
}
/*
Usage:
cutorch.setDefaultStream()
Equivalent to cutorch.setStream(0).
*/
static int cutorch_setDefaultStream(lua_State *L)
{
THCState *state = cutorch_getstate(L);
THCState_setStream(state, NULL);
return 0;
}
/*
Usage:
cutorch.streamWaitFor(waiterStream, {waitForStream1, ..., waitForStreamN})
for streams on the current device. Creates a one-way barrier where
waiterStream waits for waitForStream1-N to reach the current point.
*/
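/*
A hedged Lua sketch of the call shape (stream ids are example values and
assume the streams were reserved beforehand):
---
cutorch.reserveStreams(3)
-- ... queue kernels on streams 2 and 3 ...
cutorch.streamWaitFor(1, {2, 3})  -- stream 1 blocks until streams 2 and 3 reach this point
---
*/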
static int cutorch_streamWaitFor(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int curDev = -1;
THCudaCheck(cudaGetDevice(&curDev));
/* Check that the waiting stream is in bounds; this will error out if not */
int waitingId = (int) luaL_checknumber(L, 1);
cudaStream_t streamWaiting =
THCState_getDeviceStream(state, curDev, waitingId);
/* Validate the streams that we are waiting on */
int streams = checkAndCountListOfStreams(L, state, 2, curDev);
if (streams < 1) {
/* nothing to synchronize */
return 0;
}
/* One-way dependency; streamWaiting will wait for the list of streams to
wait on to complete execution of pending scheduled kernels/events */
cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
createSingleDeviceEvents(L, state, 2, curDev, events);
/* Then, wait on them */
for (int i = 0; i < streams; i++) {
THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
THCudaCheck(cudaEventDestroy(events[i]));
}
free(events);
return 0;
}
/*
Usage:
cutorch.streamWaitForMultiDevice(gpuWaiter, streamWaiter,
{[gpu1]={stream1_1, ..., stream1_N},
[gpuK]={streamK_1, ..., streamK_M}})
with a specified GPU per each list of streams.
Stream (gpuWaiter, streamWaiter) will wait on all of the other streams
(gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
(gpuK, streamK_1), ..., (gpuK, streamK_M) to complete fully, as a one-way
barrier only (only streamWaiter is blocked).
The streams to wait on are bucketed per device. Equivalent to
streamWaitFor() if only one GPU's streams are listed.
*/
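/*
A hedged Lua sketch of the nested-table argument (device and stream ids are
example values; all streams are assumed to have been reserved):
---
-- (gpu 1, stream 1) waits for (gpu 2, streams 1 and 2) and (gpu 3, stream 3)
cutorch.streamWaitForMultiDevice(1, 1, {[2] = {1, 2}, [3] = {3}})
---
*/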
static int cutorch_streamWaitForMultiDevice(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
/* Validate waiting (gpu, stream); this will error out if not */
int gpuWaiter = (int) luaL_checknumber(L, 1) - 1;
int streamWaiter = (int) luaL_checknumber(L, 2);
cudaStream_t streamWaiting =
THCState_getDeviceStream(state, gpuWaiter, streamWaiter);
/* Validate and count set of {gpu={streams...}} we are waiting on */
int gpus = 0;
int streams = 0;
checkAndCountListOfGPUStreamPairs(L, state, 3, &gpus, &streams);
if (streams < 1) {
/* nothing to synchronize together */
return 0;
}
/*
Events can only be recorded on the same device on which they are created.
-For each GPU, create and record event per each stream given
for that GPU.
-For (gpuWaiter, streamWaiter), wait on all of the above events.
*/
cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);
/* First, create an event per GPU and record events for the specified stream
on that GPU */
createMultiDeviceEvents(L, state, 3, events);
/* Then, wait on the events */
THCudaCheck(cudaSetDevice(gpuWaiter));
for (int i = 0; i < streams; ++i) {
THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
}
/* Clean up events */
for (int i = 0; i < streams; ++i) {
THCudaCheck(cudaEventDestroy(events[i]));
}
free(events);
THCudaCheck(cudaSetDevice(prevDev));
return 0;
}
/*
Usage:
cutorch.streamBarrier({stream1, stream2, ..., streamN})
applies to streams for the current device. Creates a N-way barrier
to synchronize all of the streams given
*/
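/*
A minimal Lua sketch (stream ids are example values):
---
cutorch.streamBarrier({1, 2, 3})  -- streams 1, 2 and 3 on the current device wait for each other
---
*/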
static int cutorch_streamBarrier(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int curDev = -1;
THCudaCheck(cudaGetDevice(&curDev));
int streams = checkAndCountListOfStreams(L, state, 1, curDev);
if (streams < 2) {
/* nothing to synchronize together */
return 0;
}
/* Multi-way dependency (barrier); all streams must complete execution
of pending scheduled kernels/events */
cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
/* First, create an event and record them for all streams */
int eventsCreated = createSingleDeviceEvents(L, state, 1, curDev, events);
/* Then, wait on the event. Each stream is actually waiting on itself here
too, but that's harmless and isn't worth weeding out. */
waitSingleDeviceEvents(L, state, 1, curDev, events, eventsCreated);
for (int i = 0; i < eventsCreated; i++)
THCudaCheck(cudaEventDestroy(events[i]));
free(events);
return 0;
}
/* usage:
cutorch.streamBarrierMultiDevice({[gpu1]={stream1_1, ..., stream1_N},
[gpuK]={streamK_1, ..., streamK_M}})
with a specified GPU per each list of streams.
Each stream (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
(gpuK, streamK_1), ..., (gpuK, streamK_M) will wait
for all others to complete fully.
Streams are bucketed per device. Equivalent to streamBarrier() if only
one GPU is specified.
*/
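/*
A hedged Lua sketch of the per-gpu table (device and stream ids are example
values):
---
cutorch.streamBarrierMultiDevice({[1] = {1, 2}, [2] = {1}})
-- (gpu 1, stream 1), (gpu 1, stream 2) and (gpu 2, stream 1) now form a barrier
---
*/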
static int cutorch_streamBarrierMultiDevice(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
/* Validate and count set of {gpu={streams...}} that are mutually waiting */
int gpus = 0;
int streams = 0;
checkAndCountListOfGPUStreamPairs(L, state, 1, &gpus, &streams);
if (streams < 2) {
/* nothing to synchronize together */
return 0;
}
/*
Events can only be recorded on the same device on which they are created.
-For each GPU, create an event, and record that event on each stream given
for that GPU.
-For each GPU, for each stream, wait on the event created by each other
GPU.
*/
cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);
/* First, create an event per GPU and record events for the specified stream
on that GPU */
createMultiDeviceEvents(L, state, 1, events);
/* Then, wait on the events. Each stream is actually waiting on itself here
too, but that's harmless and isn't worth weeding out. */
waitMultiDeviceEvents(L, state, 1, events, streams);
/* Clean up events */
for (int i = 0; i < streams; ++i) {
THCudaCheck(cudaEventDestroy(events[i]));
}
free(events);
THCudaCheck(cudaSetDevice(prevDev));
return 0;
}
/*
Usage:
cutorch.streamSynchronize(n)
For the current device, synchronizes with the given stream only
(cudaStreamSynchronize).
0 is the default stream on the device.
*/
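/*
A minimal Lua sketch (stream id 2 is an example value):
---
cutorch.streamSynchronize(2)  -- host blocks until stream 2 on the current device is idle
---
*/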
static int cutorch_streamSynchronize(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int streamId = (int) luaL_checknumber(L, 1);
int curDev = -1;
THCudaCheck(cudaGetDevice(&curDev));
/* This also validates the stream */
cudaStream_t stream = THCState_getDeviceStream(state, curDev, streamId);
THCudaCheck(cudaStreamSynchronize(stream));
return 0;
}
static int cutorch_getDevice(lua_State *L)
{
int device;
THCudaCheck(cudaGetDevice(&device));
device++;
lua_pushnumber(L, device);
return 1;
}
static int cutorch_deviceReset(lua_State *L)
{
printf("WARNING: cutorch.deviceReset has been depreceated."
" Just remove the call from your code.\n");
return 0;
}
static int cutorch_getDeviceCount(lua_State *L)
{
int ndevice;
THCudaCheck(cudaGetDeviceCount(&ndevice));
lua_pushnumber(L, ndevice);
return 1;
}
static int cutorch_getPeerToPeerAccess(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int dev = (int) luaL_checknumber(L, 1) - 1;
int devToAccess = (int) luaL_checknumber(L, 2) - 1;
/* device bounds checking is performed within */
int enabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);
lua_pushboolean(L, enabled);
return 1;
}
static int cutorch_setPeerToPeerAccess(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int dev = (int) luaL_checknumber(L, 1) - 1;
int devToAccess = (int) luaL_checknumber(L, 2) - 1;
int enable = lua_toboolean(L, 3);
/* device bounds checking is performed within */
THCState_setPeerToPeerAccess(state, dev, devToAccess, enable);
return 0;
}
static int cutorch_getKernelPeerToPeerAccess(lua_State *L)
{
THCState *state = cutorch_getstate(L);
lua_pushboolean(L, THCState_getKernelPeerToPeerAccessEnabled(state));
return 1;
}
static int cutorch_setKernelPeerToPeerAccess(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int val = lua_toboolean(L, -1);
THCState_setKernelPeerToPeerAccessEnabled(state, val);
return 0;
}
static int cutorch_isCachingAllocatorEnabled(lua_State *L)
{
THCState *state = cutorch_getstate(L);
lua_pushboolean(L, THCState_isCachingAllocatorEnabled(state));
return 1;
}
static int cutorch_getMemoryUsage(lua_State *L) {
size_t freeBytes = 0;
size_t totalBytes = 0;
int curDevice;
THCudaCheck(cudaGetDevice(&curDevice));
THCState *state = cutorch_getstate(L);
int device = luaL_optint(L, 1, -10);
if (device == -10) { /* no argument passed, current device mem usage */
THCudaCheck(THCudaMemGetInfo(state, &freeBytes, &totalBytes));
} else { /* argument was given, particular device's memory usage */
THCudaCheck(cudaSetDevice(device-1)); /* zero indexed */
THCudaCheck(THCudaMemGetInfo(state, &freeBytes, &totalBytes));
THCudaCheck(cudaSetDevice(curDevice));
}
lua_pushnumber(L, freeBytes);
lua_pushnumber(L, totalBytes);
return 2;
}
static int cutorch_setDevice(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int device = (int)luaL_checknumber(L, 1)-1;
THCudaCheck(cudaSetDevice(device));
return 0;
}
#define SET_DEVN_PROP(NAME) \
lua_pushnumber(L, prop.NAME); \
lua_setfield(L, -2, #NAME);
static int cutorch_getDeviceProperties(lua_State *L)
{
int device = (int)luaL_checknumber(L, 1)-1;
// switch context to given device so the call to cudaMemGetInfo is for the correct device
int oldDevice;
THCudaCheck(cudaGetDevice(&oldDevice));
THCudaCheck(cudaSetDevice(device));
struct cudaDeviceProp prop;
THCudaCheck(cudaGetDeviceProperties(&prop, device));
lua_newtable(L);
SET_DEVN_PROP(canMapHostMemory);
SET_DEVN_PROP(clockRate);
SET_DEVN_PROP(computeMode);
SET_DEVN_PROP(deviceOverlap);
SET_DEVN_PROP(integrated);
SET_DEVN_PROP(kernelExecTimeoutEnabled);
SET_DEVN_PROP(major);
SET_DEVN_PROP(maxThreadsPerBlock);
SET_DEVN_PROP(memPitch);
SET_DEVN_PROP(minor);
SET_DEVN_PROP(multiProcessorCount);
SET_DEVN_PROP(regsPerBlock);
SET_DEVN_PROP(sharedMemPerBlock);
SET_DEVN_PROP(textureAlignment);
SET_DEVN_PROP(totalConstMem);
SET_DEVN_PROP(totalGlobalMem);
SET_DEVN_PROP(warpSize);
SET_DEVN_PROP(pciBusID);
SET_DEVN_PROP(pciDeviceID);
SET_DEVN_PROP(pciDomainID);
SET_DEVN_PROP(maxTexture1D);
SET_DEVN_PROP(maxTexture1DLinear);
size_t freeMem;
THCudaCheck(cudaMemGetInfo (&freeMem, NULL));
lua_pushnumber(L, freeMem);
lua_setfield(L, -2, "freeGlobalMem");
lua_pushstring(L, prop.name);
lua_setfield(L, -2, "name");
// restore context
THCudaCheck(cudaSetDevice(oldDevice));
return 1;
}
static int cutorch_getRuntimeVersion(lua_State *L)
{
int version;
THCudaCheck(cudaRuntimeGetVersion(&version));
lua_pushnumber(L, version);
return 1;
}
static int cutorch_getDriverVersion(lua_State *L)
{
int version;
THCudaCheck(cudaDriverGetVersion(&version));
lua_pushnumber(L, version);
return 1;
}
static int cutorch_seed(lua_State *L)
{
unsigned long long seed = THCRandom_seed(cutorch_getstate(L));
lua_pushnumber(L, seed);
return 1;
}
static int cutorch_seedAll(lua_State *L)
{
unsigned long long seed = THCRandom_seedAll(cutorch_getstate(L));
lua_pushnumber(L, seed);
return 1;
}
static int cutorch_initialSeed(lua_State *L)
{
unsigned long long seed = THCRandom_initialSeed(cutorch_getstate(L));
lua_pushnumber(L, seed);
return 1;
}
static int cutorch_manualSeed(lua_State *L)
{
unsigned long long seed = luaL_checknumber(L, 1);
THCRandom_manualSeed(cutorch_getstate(L), seed);
return 0;
}
static int cutorch_manualSeedAll(lua_State* L)
{
unsigned long long seed = luaL_checknumber(L, 1);
THCRandom_manualSeedAll(cutorch_getstate(L), seed);
return 0;
}
static int cutorch_getRNGState(lua_State *L)
{
THByteTensor* t = THByteTensor_new();
THCRandom_getRNGState(cutorch_getstate(L), t);
luaT_pushudata(L, t, "torch.ByteTensor");
return 1;
}
static int cutorch_setRNGState(lua_State *L)
{
THByteTensor* t = luaT_checkudata(L, 1, "torch.ByteTensor");
THCRandom_setRNGState(cutorch_getstate(L), t);
return 0;
}
static int cutorch_getState(lua_State *L)
{
lua_getglobal(L, "cutorch");
lua_getfield(L, -1, "_state");
lua_remove(L, -2);
return 1;
}
static int cutorch_Event_new(lua_State *L)
{
cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t));
THCudaCheck(cudaEventCreate(event));
THCState *state = cutorch_getstate(L);
THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state)));
luaT_pushudata(L, event, "cutorch.Event");
return 1;
}
static int cutorch_Event_free(lua_State *L)
{
cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event");
THCudaCheck(cudaEventDestroy(*event));
luaT_free(L, event);
return 0;
}
static int cutorch_Event_waitOn(lua_State *L)
{
cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event");
THCState *state = cutorch_getstate(L);
THCudaCheck(cudaStreamWaitEvent(THCState_getCurrentStream(state), *event, 0));
return 0;
}
static const struct luaL_Reg cutorch_Event__[] = {
{"waitOn", cutorch_Event_waitOn},
{NULL, NULL}
};
static void cutorch_Event_init(lua_State *L)
{
luaT_newmetatable(L, "cutorch.Event", NULL, cutorch_Event_new, cutorch_Event_free, NULL);
luaT_setfuncs(L, cutorch_Event__, 0);
lua_pop(L, 1);
}
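/*
A hedged Lua sketch of event-based ordering between streams, assuming the
constructor is exposed as cutorch.Event() (stream ids are example values):
---
cutorch.setStream(1)
local e = cutorch.Event()  -- records an event on the current stream (1)
cutorch.setStream(2)
e:waitOn()                 -- stream 2 waits until the event recorded on stream 1 completes
---
*/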
static void luaCutorchGCFunction(void *data)
{
lua_State *L = data;
lua_gc(L, LUA_GCCOLLECT, 0);
}
static int cutorch_setHeapTracking(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int enabled = luaT_checkboolean(L,1);
if(enabled) {
THCSetGCHandler(state, luaCutorchGCFunction, L);
} else {
THCSetGCHandler(state, NULL, NULL);
}
return 0;
}
static int cutorch_isManagedPtr(lua_State *L)
{
THCState *state = cutorch_getstate(L);
if(lua_type(L, 1) != LUA_TNUMBER) {
THError("Must receive a ptr cast as a number");
}
void* ptr = (void* )luaL_optinteger(L, 1, 0);
struct cudaPointerAttributes attributes;
cudaError_t res = cudaPointerGetAttributes(&attributes, ptr);
if (res == cudaErrorInvalidValue) {
lua_pushboolean(L, 0);
} else {
THCudaCheck(res);
lua_pushboolean(L, attributes.isManaged);
}
return 1;
}
static int cutorch_shutdown(lua_State *L)
{
THCState **state = (THCState **) lua_topointer(L, 1);
THCudaShutdown(*state);
THCState_free(*state);
return 0;
}
static int cutorch_hasHalfInstructions(lua_State *L) {
THCState *state = cutorch_getstate(L);
#ifdef CUDA_HALF_TENSOR
lua_pushboolean(L, THC_nativeHalfInstructions(state));
#else
lua_pushboolean(L, 0);
#endif
return 1;
}
static int cutorch_hasFastHalfInstructions(lua_State *L) {
THCState *state = cutorch_getstate(L);
#ifdef CUDA_HALF_TENSOR
lua_pushboolean(L, THC_fastHalfInstructions(state));
#else
lua_pushboolean(L, 0);
#endif
return 1;
}
static int cutorch_sleep(lua_State *L) {
THCState *state = cutorch_getstate(L);
if (!luaT_checklong(L, 1)) {
THError("expected number 'cycles'");
}
THC_sleep(state, luaT_tolong(L, 1));
return 0;
}
static const struct luaL_Reg cutorch_stuff__ [] = {
{"synchronize", cutorch_synchronize},
{"synchronizeAll", cutorch_synchronizeAll},
{"reserveBlasHandles", cutorch_reserveBlasHandles},
{"getNumBlasHandles", cutorch_getNumBlasHandles},
{"setBlasHandle", cutorch_setBlasHandle},
{"getBlasHandle", cutorch_getBlasHandle},
{"reserveStreams", cutorch_reserveStreams},
{"getNumStreams", cutorch_getNumStreams},
{"setStream", cutorch_setStream},
{"getStream", cutorch_getStream},
{"setDefaultStream", cutorch_setDefaultStream},
{"streamWaitFor", cutorch_streamWaitFor},
{"streamWaitForMultiDevice", cutorch_streamWaitForMultiDevice},
{"streamBarrier", cutorch_streamBarrier},
{"streamBarrierMultiDevice", cutorch_streamBarrierMultiDevice},
{"streamSynchronize", cutorch_streamSynchronize},
{"getDevice", cutorch_getDevice},
{"deviceReset", cutorch_deviceReset},
{"getDeviceCount", cutorch_getDeviceCount},
{"getPeerToPeerAccess", cutorch_getPeerToPeerAccess},
{"setPeerToPeerAccess", cutorch_setPeerToPeerAccess},
{"setKernelPeerToPeerAccess", cutorch_setKernelPeerToPeerAccess},
{"getKernelPeerToPeerAccess", cutorch_getKernelPeerToPeerAccess},
{"isCachingAllocatorEnabled", cutorch_isCachingAllocatorEnabled},
{"getDeviceProperties", cutorch_getDeviceProperties},
{"getRuntimeVersion", cutorch_getRuntimeVersion},
{"getDriverVersion", cutorch_getDriverVersion},
{"getMemoryUsage", cutorch_getMemoryUsage},
{"hasHalfInstructions", cutorch_hasHalfInstructions},
{"hasFastHalfInstructions", cutorch_hasFastHalfInstructions},
{"setDevice", cutorch_setDevice},
{"seed", cutorch_seed},
{"seedAll", cutorch_seedAll},
{"initialSeed", cutorch_initialSeed},
{"manualSeed", cutorch_manualSeed},
{"manualSeedAll", cutorch_manualSeedAll},
{"_sleep", cutorch_sleep},
{"getRNGState", cutorch_getRNGState},
{"setRNGState", cutorch_setRNGState},
{"getState", cutorch_getState},
{"setHeapTracking", cutorch_setHeapTracking},
{"isManagedPtr", cutorch_isManagedPtr},
{NULL, NULL}
};
LUA_EXTERNC DLL_EXPORT int luaopen_libcutorch(lua_State *L);
int luaopen_libcutorch(lua_State *L)
{
lua_newtable(L);
lua_pushvalue(L, -1);
lua_setglobal(L, "cutorch");
luaL_setfuncs(L, cutorch_stuff__, 0);
THCState* state = THCState_alloc();
/* Enable the caching allocator unless THC_CACHING_ALLOCATOR=0 */
char* thc_caching_allocator = getenv("THC_CACHING_ALLOCATOR");
if (!thc_caching_allocator || strcmp(thc_caching_allocator, "0") != 0) {
THCState_setDeviceAllocator(state, THCCachingAllocator_get());
state->cudaHostAllocator = &THCCachingHostAllocator;
}
THCudaInit(state);
/* Register torch.CudaHostAllocator. */
luaT_pushudata(L, THCState_getCudaHostAllocator(state), "torch.Allocator");
lua_setfield(L, -2, "CudaHostAllocator");
/* Register torch.CudaUVAHostAllocator. */
luaT_pushudata(L, THCState_getCudaUVAAllocator(state), "torch.Allocator");
lua_setfield(L, -2, "CudaUVAAllocator");
#ifdef USE_MAGMA
THCMagma_init(state);
lua_pushboolean(L, 1);
lua_setfield(L, -2, "magma");
#endif
cutorch_CudaByteStorage_init(L);
cutorch_CudaCharStorage_init(L);
cutorch_CudaShortStorage_init(L);
cutorch_CudaIntStorage_init(L);
cutorch_CudaLongStorage_init(L);
cutorch_CudaStorage_init(L);
cutorch_CudaDoubleStorage_init(L);
#ifdef CUDA_HALF_TENSOR
cutorch_CudaHalfStorage_init(L);
#else
cutorch_HalfStorageCopy_init(L);
#endif
cutorch_CudaByteTensor_init(L);
cutorch_CudaCharTensor_init(L);
cutorch_CudaShortTensor_init(L);
cutorch_CudaIntTensor_init(L);
cutorch_CudaLongTensor_init(L);
cutorch_CudaTensor_init(L);
cutorch_CudaDoubleTensor_init(L);
#ifdef CUDA_HALF_TENSOR
cutorch_CudaHalfTensor_init(L);
#else
cutorch_HalfTensorCopy_init(L);
#endif
cutorch_CudaByteTensorOperator_init(L);
cutorch_CudaCharTensorOperator_init(L);
cutorch_CudaShortTensorOperator_init(L);
cutorch_CudaIntTensorOperator_init(L);
cutorch_CudaLongTensorOperator_init(L);
cutorch_CudaTensorOperator_init(L);
cutorch_CudaDoubleTensorOperator_init(L);
#ifdef CUDA_HALF_TENSOR
cutorch_CudaHalfTensorOperator_init(L);
#endif
cutorch_CudaByteTensorMath_init(L);
cutorch_CudaCharTensorMath_init(L);
cutorch_CudaShortTensorMath_init(L);
cutorch_CudaIntTensorMath_init(L);
cutorch_CudaLongTensorMath_init(L);
cutorch_CudaTensorMath_init(L);
cutorch_CudaDoubleTensorMath_init(L);
#ifdef CUDA_HALF_TENSOR
cutorch_CudaHalfTensorMath_init(L);
#endif
cutorch_Event_init(L);
/* Store state in cutorch table. */
lua_pushlightuserdata(L, state);
lua_setfield(L, -2, "_state");
#ifdef CUDA_HALF_TENSOR
lua_pushboolean(L, 1);
#else
lua_pushboolean(L, 0);
#endif
lua_setfield(L, -2, "hasHalf");
/* store gpu driver version in field */
int driverVersion;
THCudaCheck(cudaDriverGetVersion(&driverVersion));
lua_pushinteger(L, driverVersion);
lua_setfield(L, -2, "driverVersion");
/* When cutorch goes out of scope, we need to make sure THCState is properly
shut down (so that memory does not leak). Since _state is a lightuserdata
we cannot associate an __gc method with it. Hence, create a userdata and
associate a metatable with it which has an __gc method that properly
calls THCudaShutdown.
*/
/* create a new userdata type which is a pointer to a pointer */
THCState **thc_pointer = (THCState**)lua_newuserdata(L, sizeof(void*));
/* set the state pointer */
*thc_pointer = state;
/* create a table that will be used as the metatable */
lua_newtable(L);
/* push the gc function onto the stack */
lua_pushcfunction(L, &cutorch_shutdown);
/* set the __gc field in the table to the function (function is popped) */
lua_setfield(L, -2, "__gc");
/* now the table is on the top of the stack, and the userdata below it,
setmetatable on the userdata with the table. table is popped */
lua_setmetatable(L, -2);
/* now the userdata is on top, with the cutorch table below it,
set the field cutorch.__stategc to this userdata.
userdata is popped, leaving cutorch table on top of the stack */
lua_setfield(L, -2, "_stategc");
return 1;
}