#include "openmm/common/windowsExportCommon.h"
#include "OpenCLArray.h"
#include "OpenCLBondedUtilities.h"
#include "OpenCLExpressionUtilities.h"
#include "OpenCLIntegrationUtilities.h"
#include "OpenCLNonbondedUtilities.h"
#include "OpenCLPlatform.h"
#include "openmm/common/ComputeContext.h"
namespace OpenMM {
class OpenCLForceInfo;
/**
* These are a few extra vector types beyond the ones in ComputeVectorTypes.h.
*/
struct mm_float8 {
cl_float s0, s1, s2, s3, s4, s5, s6, s7;
mm_float8() {
}
mm_float8(cl_float s0, cl_float s1, cl_float s2, cl_float s3, cl_float s4, cl_float s5, cl_float s6, cl_float s7) :
s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7) {
}
};
struct mm_float16 {
cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
mm_float16() {
}
mm_float16(cl_float s0, cl_float s1, cl_float s2, cl_float s3, cl_float s4, cl_float s5, cl_float s6, cl_float s7,
cl_float s8, cl_float s9, cl_float s10, cl_float s11, cl_float s12, cl_float s13, cl_float s14, cl_float s15) :
s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7),
s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
}
};
struct mm_ushort2 {
cl_ushort x, y;
mm_ushort2() {
}
mm_ushort2(cl_ushort x, cl_ushort y) : x(x), y(y) {
}
};
struct mm_int8 {
cl_int s0, s1, s2, s3, s4, s5, s6, s7;
mm_int8() {
}
mm_int8(cl_int s0, cl_int s1, cl_int s2, cl_int s3, cl_int s4, cl_int s5, cl_int s6, cl_int s7) :
s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7) {
}
};
struct mm_int16 {
cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
mm_int16() {
}
mm_int16(cl_int s0, cl_int s1, cl_int s2, cl_int s3, cl_int s4, cl_int s5, cl_int s6, cl_int s7,
cl_int s8, cl_int s9, cl_int s10, cl_int s11, cl_int s12, cl_int s13, cl_int s14, cl_int s15) :
s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7),
s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
}
};
/**
* This class contains the information associated with a Context by the OpenCL Platform. Each OpenCLContext is
* specific to a particular device, and manages data structures and kernels for that device. When running a simulation
* in parallel on multiple devices, there is a separate OpenCLContext for each one. The list of all contexts is
* stored in the OpenCLPlatform::PlatformData.
*
* In addition, a worker thread is created for each OpenCLContext. This is used for parallel computations, so that
* blocking calls to one device will not block other devices. When only a single device is being used, the worker
* thread is not used and calculations are performed on the main application thread.
*/
class OPENMM_EXPORT_COMMON OpenCLContext : public ComputeContext {
public:
class WorkTask;
class ReorderListener;
class ForcePreComputation;
class ForcePostComputation;
static const int ThreadBlockSize;
static const int TileSize;
OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData,
OpenCLContext* originalContext);
~OpenCLContext();
/**
* This is called to initialize internal data structures after all Forces in the system
* have been initialized.
*/
void initialize();
/**
* Add an ComputeForceInfo to this context.
*/
void addForce(ComputeForceInfo* force);
/**
* Request that the context provide at least a particular number of force buffers.
* Force kernels should call this during initialization.
*/
void requestForceBuffers(int minBuffers);
/**
* Get the cl::Context associated with this object.
*/
cl::Context& getContext() {
return context;
}
/**
* Get the cl::Device associated with this object.
*/
cl::Device& getDevice() {
return device;
}
/**
* Get the index of the cl::Device associated with this object.
*/
int getDeviceIndex() {
return deviceIndex;
}
/**
* Get the index of the cl::Platform associated with this object.
*/
int getPlatformIndex() {
return platformIndex;
}
/**
* Get the PlatformData object this context is part of.
*/
OpenCLPlatform::PlatformData& getPlatformData() {
return platformData;
}
/**
* Get the number of contexts being used for the current simulation.
* This is relevant when a simulation is parallelized across multiple devices. In that case,
* one OpenCLContext is created for each device.
*/
int getNumContexts() const {
return platformData.contexts.size();
}
/**
* Get the index of this context in the list stored in the PlatformData.
*/
int getContextIndex() const {
return contextIndex;
}
/**
* Get the cl::CommandQueue currently being used for execution.
*/
cl::CommandQueue& getQueue();
/**
* Set the cl::ComandQueue to use for execution.
*/
void setQueue(cl::CommandQueue& queue);
/**
* Reset the context to using the default queue for execution.
*/
void restoreDefaultQueue();
/**
* Construct an uninitialized array of the appropriate class for this platform. The returned
* value should be created on the heap with the "new" operator.
*/
OpenCLArray* createArray();
/**
* Construct a ComputeEvent object of the appropriate class for this platform.
*/
ComputeEvent createEvent();
/**
* Compile source code to create a ComputeProgram.
*
* @param source the source code of the program
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
*/
ComputeProgram compileProgram(const std::string source, const std::map& defines=std::map());
/**
* Convert an array to an OpenCLArray. If the argument is already an OpenCLArray, this simply casts it.
* If the argument is a ComputeArray that wraps an OpenCLArray, this returns the wrapped array. For any
* other argument, this throws an exception.
*/
OpenCLArray& unwrap(ArrayInterface& array) const;
/**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/
OpenCLArray& getPosq() {
return posq;
}
/**
* Get the array which contains a correction to the position of each atom. This only exists if getUseMixedPrecision() returns true.
*/
OpenCLArray& getPosqCorrection() {
return posqCorrection;
}
/**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/
OpenCLArray& getVelm() {
return velm;
}
/**
* Get the array which contains the force on each atom.
*/
OpenCLArray& getForce() {
return force;
}
/**
* Get the array which contains the buffers in which forces are computed.
*/
OpenCLArray& getForceBuffers() {
return forceBuffers;
}
/**
* Get the array which contains a contribution to each force represented as 64 bit fixed point.
*/
OpenCLArray& getLongForceBuffer() {
return longForceBuffer;
}
/**
* Get the array which contains the buffer in which energy is computed.
*/
OpenCLArray& getEnergyBuffer() {
return energyBuffer;
}
/**
* Get the array which contains the buffer in which derivatives of the energy with respect to parameters are computed.
*/
OpenCLArray& getEnergyParamDerivBuffer() {
return energyParamDerivBuffer;
}
/**
* Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
* This is guaranteed to be at least as large as any of the arrays returned by methods of this class.
*/
void* getPinnedBuffer() {
return pinnedMemory;
}
/**
* Get a shared ThreadPool that code can use to parallelize operations.
*
* Because this object is freely available to all code, care is needed to avoid conflicts. Only use it
* from the main thread, and make sure all operations are complete before you invoke any other code that
* might make use of it
*/
ThreadPool& getThreadPool() {
return getPlatformData().threads;
}
/**
* Get the array which contains the index of each atom.
*/
OpenCLArray& getAtomIndexArray() {
return atomIndexDevice;
}
/**
* Create an OpenCL Program from source code.
*
* @param source the source code of the program
* @param optimizationFlags the optimization flags to pass to the OpenCL compiler. If this is
* omitted, a default set of options will be used
*/
cl::Program createProgram(const std::string source, const char* optimizationFlags = NULL);
/**
* Create an OpenCL Program from source code.
*
* @param source the source code of the program
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
* @param optimizationFlags the optimization flags to pass to the OpenCL compiler. If this is
* omitted, a default set of options will be used
*/
cl::Program createProgram(const std::string source, const std::map& defines, const char* optimizationFlags = NULL);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param workUnits the maximum number of work units that should be used
* @param blockSize the size of each thread block to use
*/
void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
/**
* Set all elements of an array to 0.
*/
void clearBuffer(ArrayInterface& array);
/**
* Set all elements of an array to 0.
*
* @param memory the Memory to clear
* @param size the size of the buffer in bytes
*/
void clearBuffer(cl::Memory& memory, int size);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*/
void addAutoclearBuffer(ArrayInterface& array);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the Memory to clear
* @param size the size of the buffer in bytes
*/
void addAutoclearBuffer(cl::Memory& memory, int size);
/**
* Clear all buffers that have been registered with addAutoclearBuffer().
*/
void clearAutoclearBuffers();
/**
* Given a collection of floating point buffers packed into an array, sum them and store
* the sum in the first buffer.
*
* @param array the array containing the buffers to reduce
* @param numBuffers the number of buffers packed into the array
*/
void reduceBuffer(OpenCLArray& array, int numBuffers);
/**
* Sum the buffers containing forces.
*/
void reduceForces();
/**
* Sum the buffer containing energy.
*/
double reduceEnergy();
/**
* Get the current simulation time.
*/
double getTime() {
return time;
}
/**
* Set the current simulation time.
*/
void setTime(double t) {
time = t;
}
/**
* Get the number of integration steps that have been taken.
*/
int getStepCount() {
return stepCount;
}
/**
* Set the number of integration steps that have been taken.
*/
void setStepCount(int steps) {
stepCount = steps;
}
/**
* Get the number of times forces or energy has been computed.
*/
int getComputeForceCount() {
return computeForceCount;
}
/**
* Set the number of times forces or energy has been computed.
*/
void setComputeForceCount(int count) {
computeForceCount = count;
}
/**
* Get the number of time steps since the atoms were reordered.
*/
int getStepsSinceReorder() const {
return stepsSinceReorder;
}
/**
* Set the number of time steps since the atoms were reordered.
*/
void setStepsSinceReorder(int steps) {
stepsSinceReorder = steps;
}
/**
* Get the flag that marks whether the current force evaluation is valid.
*/
bool getForcesValid() const {
return forcesValid;
}
/**
* Get the flag that marks whether the current force evaluation is valid.
*/
void setForcesValid(bool valid) {
forcesValid = valid;
}
/**
* Get the number of blocks of TileSize atoms.
*/
int getNumAtomBlocks() const {
return numAtomBlocks;
}
/**
* Get the standard number of thread blocks to use when executing kernels.
*/
int getNumThreadBlocks() const {
return numThreadBlocks;
}
/**
* Get the maximum number of threads in a thread block supported by this device.
*/
int getMaxThreadBlockSize() const {
return device.getInfo();
}
/**
* Get the number of force buffers.
*/
int getNumForceBuffers() const {
return numForceBuffers;
}
/**
* Get whether the device being used is a CPU. In some cases, different algorithms
* may be more efficient on CPUs and GPUs.
*/
bool getIsCPU() const {
return (device.getInfo() == CL_DEVICE_TYPE_CPU);
}
/**
* Get the SIMD width of the device being used.
*/
int getSIMDWidth() const {
return simdWidth;
}
/**
* Get whether the device being used supports 64 bit atomic operations on global memory.
*/
bool getSupports64BitGlobalAtomics() const {
return supports64BitGlobalAtomics;
}
/**
* Get whether the device being used supports double precision math.
*/
bool getSupportsDoublePrecision() const {
return supportsDoublePrecision;
}
/**
* Get whether double precision is being used.
*/
bool getUseDoublePrecision() const {
return useDoublePrecision;
}
/**
* Get whether mixed precision is being used.
*/
bool getUseMixedPrecision() const {
return useMixedPrecision;
}
/**
* Get whether the periodic box is triclinic.
*/
bool getBoxIsTriclinic() const {
return boxIsTriclinic;
}
/**
* Get the vectors defining the periodic box.
*/
void getPeriodicBoxVectors(Vec3& a, Vec3& b, Vec3& c) const {
a = Vec3(periodicBoxVecXDouble.x, periodicBoxVecXDouble.y, periodicBoxVecXDouble.z);
b = Vec3(periodicBoxVecYDouble.x, periodicBoxVecYDouble.y, periodicBoxVecYDouble.z);
c = Vec3(periodicBoxVecZDouble.x, periodicBoxVecZDouble.y, periodicBoxVecZDouble.z);
}
/**
* Set the vectors defining the periodic box.
*/
void setPeriodicBoxVectors(const Vec3& a, const Vec3& b, const Vec3& c) {
periodicBoxVecX = mm_float4((float) a[0], (float) a[1], (float) a[2], 0.0f);
periodicBoxVecY = mm_float4((float) b[0], (float) b[1], (float) b[2], 0.0f);
periodicBoxVecZ = mm_float4((float) c[0], (float) c[1], (float) c[2], 0.0f);
periodicBoxVecXDouble = mm_double4(a[0], a[1], a[2], 0.0);
periodicBoxVecYDouble = mm_double4(b[0], b[1], b[2], 0.0);
periodicBoxVecZDouble = mm_double4(c[0], c[1], c[2], 0.0);
periodicBoxSize = mm_float4((float) a[0], (float) b[1], (float) c[2], 0.0f);
invPeriodicBoxSize = mm_float4(1.0f/(float) a[0], 1.0f/(float) b[1], 1.0f/(float) c[2], 0.0f);
periodicBoxSizeDouble = mm_double4(a[0], b[1], c[2], 0.0);
invPeriodicBoxSizeDouble = mm_double4(1.0/a[0], 1.0/b[1], 1.0/c[2], 0.0);
}
/**
* Get the size of the periodic box.
*/
mm_float4 getPeriodicBoxSize() const {
return periodicBoxSize;
}
/**
* Get the size of the periodic box.
*/
mm_double4 getPeriodicBoxSizeDouble() const {
return periodicBoxSizeDouble;
}
/**
* Get the inverse of the size of the periodic box.
*/
mm_float4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize;
}
/**
* Get the inverse of the size of the periodic box.
*/
mm_double4 getInvPeriodicBoxSizeDouble() const {
return invPeriodicBoxSizeDouble;
}
/**
* Get the first periodic box vector.
*/
mm_float4 getPeriodicBoxVecX() {
return periodicBoxVecX;
}
/**
* Get the first periodic box vector.
*/
mm_double4 getPeriodicBoxVecXDouble() {
return periodicBoxVecXDouble;
}
/**
* Get the second periodic box vector.
*/
mm_float4 getPeriodicBoxVecY() {
return periodicBoxVecY;
}
/**
* Get the second periodic box vector.
*/
mm_double4 getPeriodicBoxVecYDouble() {
return periodicBoxVecYDouble;
}
/**
* Get the third periodic box vector.
*/
mm_float4 getPeriodicBoxVecZ() {
return periodicBoxVecZ;
}
/**
* Get the third periodic box vector.
*/
mm_double4 getPeriodicBoxVecZDouble() {
return periodicBoxVecZDouble;
}
/**
* Get the OpenCLIntegrationUtilities for this context.
*/
OpenCLIntegrationUtilities& getIntegrationUtilities() {
return *integration;
}
/**
* Get the OpenCLExpressionUtilities for this context.
*/
OpenCLExpressionUtilities& getExpressionUtilities() {
return *expression;
}
/**
* Get the OpenCLBondedUtilities for this context.
*/
OpenCLBondedUtilities& getBondedUtilities() {
return *bonded;
}
/**
* Get the OpenCLNonbondedUtilities for this context.
*/
OpenCLNonbondedUtilities& getNonbondedUtilities() {
return *nonbonded;
}
/**
* This should be called by the Integrator from its own initialize() method.
* It ensures all contexts are fully initialized.
*/
void initializeContexts();
/**
* Set the particle charges. These are packed into the fourth element of the posq array.
*/
void setCharges(const std::vector& charges);
/**
* Request to use the fourth element of the posq array for storing charges. Since only one force can
* do that, this returns true the first time it is called, and false on all subsequent calls.
*/
bool requestPosqCharges();
/**
* Get the names of all parameters with respect to which energy derivatives are computed.
*/
const std::vector& getEnergyParamDerivNames() const {
return energyParamDerivNames;
}
/**
* Get a workspace data structure used for accumulating the values of derivatives of the energy
* with respect to parameters.
*/
std::map& getEnergyParamDerivWorkspace() {
return energyParamDerivWorkspace;
}
/**
* Register that the derivative of potential energy with respect to a context parameter
* will need to be calculated. If this is called multiple times for a single parameter,
* it is only added to the list once.
*
* @param param the name of the parameter to add
*/
void addEnergyParameterDerivative(const std::string& param);
/**
* Wait until all work that has been queued (kernel executions, asynchronous data transfers, etc.)
* has been submitted to the device. This does not mean it has necessarily been completed.
* Calling this periodically may improve the responsiveness of the computer's GUI, but at the
* expense of reduced simulation performance.
*/
void flushQueue();
private:
OpenCLPlatform::PlatformData& platformData;
int deviceIndex;
int platformIndex;
int contextIndex;
int numAtomBlocks;
int numThreadBlocks;
int numForceBuffers;
int simdWidth;
bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, boxIsTriclinic, hasAssignedPosqCharges;
mm_float4 periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ;
mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble, periodicBoxVecXDouble, periodicBoxVecYDouble, periodicBoxVecZDouble;
std::string defaultOptimizationOptions;
std::map compilationDefines;
cl::Context context;
cl::Device device;
cl::CommandQueue defaultQueue, currentQueue;
cl::Kernel clearBufferKernel;
cl::Kernel clearTwoBuffersKernel;
cl::Kernel clearThreeBuffersKernel;
cl::Kernel clearFourBuffersKernel;
cl::Kernel clearFiveBuffersKernel;
cl::Kernel clearSixBuffersKernel;
cl::Kernel reduceReal4Kernel;
cl::Kernel reduceForcesKernel;
cl::Kernel reduceEnergyKernel;
cl::Kernel setChargesKernel;
cl::Buffer* pinnedBuffer;
void* pinnedMemory;
OpenCLArray posq;
OpenCLArray posqCorrection;
OpenCLArray velm;
OpenCLArray force;
OpenCLArray forceBuffers;
OpenCLArray longForceBuffer;
OpenCLArray energyBuffer;
OpenCLArray energySum;
OpenCLArray energyParamDerivBuffer;
OpenCLArray atomIndexDevice;
OpenCLArray chargeBuffer;
std::vector energyParamDerivNames;
std::map energyParamDerivWorkspace;
std::vector autoclearBuffers;
std::vector autoclearBufferSizes;
OpenCLIntegrationUtilities* integration;
OpenCLExpressionUtilities* expression;
OpenCLBondedUtilities* bonded;
OpenCLNonbondedUtilities* nonbonded;
};
/**
* This class exists only for backward compatibility. Use ComputeContext::WorkTask instead.
*/
class OPENMM_EXPORT_COMMON OpenCLContext::WorkTask : public ComputeContext::WorkTask {
};
/**
* This class exists only for backward compatibility. Use ComputeContext::ReorderListener instead.
*/
class OPENMM_EXPORT_COMMON OpenCLContext::ReorderListener : public ComputeContext::ReorderListener {
};
/**
* This class exists only for backward compatibility. Use ComputeContext::ForcePreComputation instead.
*/
class OPENMM_EXPORT_COMMON OpenCLContext::ForcePreComputation : public ComputeContext::ForcePreComputation {
};
/**
* This class exists only for backward compatibility. Use ComputeContext::ForcePostComputation instead.
*/
class OPENMM_EXPORT_COMMON OpenCLContext::ForcePostComputation : public ComputeContext::ForcePostComputation {
};
} // namespace OpenMM
#endif /*OPENMM_OPENCLCONTEXT_H_*/