from __future__ import absolute_import, print_function, division

import os
import sys

import theano
import theano.tensor as T
import theano.tensor.nnet.ctc
from theano import (config, gof)
from theano.gof import local_optimizer
from theano.gradient import grad_undefined
from theano.tensor.nnet.ctc import ctc_available
from theano.tensor.opt import register_canonicalize

from .basic_ops import (gpu_contiguous, as_gpuarray_variable,
                        infer_context_name, gpuarray_helper_inc_dir)
from .elemwise import GpuDimShuffle
from .type import (GpuArrayType, gpu_context_type)

import pygpu


class GpuConnectionistTemporalClassification(gof.COp):
    """
    GPU wrapper for Baidu CTC loss function.

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss
        function.
    """
    __props__ = ('compute_grad',)

    _cop_num_inputs = 3
    _cop_num_outputs = 2

    func_file = "./c_code/ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"

    params_type = gpu_context_type

    def __init__(self, compute_grad=True):
        if not ctc_available():
            raise RuntimeError('Baidu CTC is not available and '
                               'GpuConnectionistTemporalClassification Op '
                               'can not be constructed.')

        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0

        gof.COp.__init__(self, self.func_file, self.func_name)

    def c_lib_dirs(self):
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_compile_args(self):
        if ctc_available.path is not None:
            # Quote the rpath if the path contains spaces (not supported
            # by the linker on darwin).
            if sys.platform != 'darwin' and ' ' in ctc_available.path:
                return ['-Wl,-rpath,"' + ctc_available.path + '"']
            else:
                return ['-Wl,-rpath,' + ctc_available.path]
        return []

    def c_libraries(self):
        return ["warpctc", "gpuarray"]

    def c_header_dirs(self):
        dirs = [gpuarray_helper_inc_dir(),
                pygpu.get_include(),
                config.cuda.include_path]
        if config.ctc.root != '':
            dirs.append(os.path.join(config.ctc.root, "include"))
        return dirs

    def c_headers(self):
        return ['ctc.h', 'numpy_compat.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'gpuarray/types.h', 'gpuarray_api.h',
                'gpuarray/array.h', 'gpuarray/util.h',
                'gpuarray/extension.h']

    def get_params(self, node):
        return node.inputs[0].type.context

    def make_node(self, activations, labels, input_lengths):
        context_name = infer_context_name(activations)
        t_activations = as_gpuarray_variable(activations,
                                             context_name=context_name)
        # Ensure activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)

        # Labels and input lengths are always on the CPU
        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type.')

        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')

        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type.')

        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')

        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type.')

        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')

        costs = GpuArrayType(dtype='float32',
                             broadcastable=(False,),
                             context_name=context_name)()
        outputs = [costs]

        if self.compute_grad:
            gradients = GpuArrayType(dtype='float32',
                                     broadcastable=(False, False, False,),
                                     context_name=context_name)()
            outputs += [gradients]

        return theano.Apply(self, inputs=[t_activations, t_labels,
                                          t_input_lengths],
                            outputs=outputs)

    def L_op(self, inputs, outputs, output_grads):
        # Gradients computed by Op
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None

        # Gradients of original function, to compose chain rule
        grad_op = output_grads[0]
        # The Op returns gradients with shape (t, m, p). Move the minibatch
        # axis first, scale each example's gradient by the incoming gradient
        # of its cost, then restore the (t, m, p) layout.
        grad_shuffle = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                     new_order=(1, 0, 2))(gradients)
        grad_bdot = T.basic.batched_dot(grad_op, grad_shuffle)
        grad_shuffle_reverse = GpuDimShuffle(input_broadcastable=(False, False, False,),
                                             new_order=(1, 0, 2))(grad_bdot)
        return [grad_shuffle_reverse,
                grad_undefined(self, 1, inputs[1]),
                grad_undefined(self, 2, inputs[2])]

def gpu_ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function on the GPU.

    Parameters
    ----------
    activations
        Three-dimensional tensor, which has a shape of (t, m, p), where
        t is the time index, m is the minibatch index, and p is the index
        over the probabilities of each symbol in the alphabet. The memory
        layout is assumed to be C-ordered, i.e., dimensions run from the
        slowest to the fastest changing, left to right; here, p is the
        fastest changing dimension.
    labels
        A 2-D tensor of all the labels for the minibatch. In each row, there
        is a sequence of target labels. Negative values are assumed to be
        padding, and thus are ignored. Blank symbol is assumed to have
        index 0 in the alphabet.
    input_lengths
        A 1-D tensor with the number of time steps for each sequence in the
        minibatch.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    return GpuConnectionistTemporalClassification()(activations, labels,
                                                    input_lengths)


# Disable gradient computation if not needed
@register_canonicalize("fast_compile")
@local_optimizer([GpuConnectionistTemporalClassification])
def local_gpu_ctc_no_grad(node):
    if isinstance(node.op, GpuConnectionistTemporalClassification):
        if len(node.outputs) > 1:
            if len(node.outputs[1].clients) == 0:  # gradient is not used
                return [GpuConnectionistTemporalClassification(compute_grad=False)(*node.inputs),
                        None]
    return False
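
# A minimal usage sketch of gpu_ctc, kept out of the import-time code path.
# It assumes Theano is configured with a gpuarray device and that warp-ctc
# is available; the variable names below are illustrative only.
if __name__ == "__main__":
    # Symbolic inputs, matching the types enforced by make_node():
    # float32 activations of shape (t, m, p), int32 labels of shape
    # (m, max_label_length), and int32 per-example lengths of shape (m,).
    activations = T.ftensor3("activations")
    labels = T.imatrix("labels")
    input_lengths = T.ivector("input_lengths")

    costs = gpu_ctc(activations, labels, input_lengths)
    # Differentiate only w.r.t. the activations; L_op() marks the gradients
    # w.r.t. labels and input_lengths as undefined.
    grad = theano.grad(costs.sum(), wrt=activations)

    ctc_fn = theano.function([activations, labels, input_lengths],
                             [costs, grad])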