From 5f5a691cc0ff3bf3800fb4e14112af56d4d1d3e4 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 10 Oct 2024 17:38:39 -0500
Subject: [PATCH] Fix doc build

---
 doc/array.rst  | 152 +++++++++++++++++++++++++------------------------
 doc/conf.py    |   5 ++
 doc/driver.rst |  32 +++++------
 doc/gl.rst     |  11 ++--
 doc/misc.rst   |  43 +++++++-------
 doc/util.rst   |  10 ++--
 6 files changed, 134 insertions(+), 119 deletions(-)

diff --git a/doc/array.rst b/doc/array.rst
index b10c3efc..3ece17dc 100644
--- a/doc/array.rst
+++ b/doc/array.rst
@@ -30,7 +30,7 @@ The :class:`GPUArray` Array Class
     of bytes to be allocated, returns an object that can be cast to an
     :class:`int` representing the address of the newly allocated memory.
     Observe that both :func:`pycuda.driver.mem_alloc` and
-    :meth:`pycuda.tools.DeviceMemoryPool.alloc` are a model of this interface.
+    :meth:`pycuda.tools.DeviceMemoryPool.allocate` are a model of this interface.
 
     All arguments beyond *allocator* should be considered keyword-only.
 
@@ -132,7 +132,7 @@ The :class:`GPUArray` Array Class
     .. method :: get(ary=None, pagelocked=False)
 
         Transfer the contents of *self* into *ary* or a newly allocated
-        :mod:`numpy.ndarray`. If *ary* is given, it must have the same
+        :class:`numpy.ndarray`. If *ary* is given, it must have the same
         shape and dtype. If it is not given,
         a *pagelocked* specifies whether the new array is allocated
         page-locked.
@@ -144,7 +144,7 @@ The :class:`GPUArray` Array Class
     .. method :: get_async(stream=None, ary=None)
 
         Transfer the contents of *self* into *ary* or a newly allocated
-        :mod:`numpy.ndarray`. If *ary* is given, it must have the right
+        :class:`numpy.ndarray`. If *ary* is given, it must have the right
         size (not necessarily shape) and dtype. If it is not given,
         a *page-locked* array is newly allocated.
 
@@ -159,20 +159,26 @@ The :class:`GPUArray` Array Class
         :meth:`pycuda.driver.Function.prepared_timed_call`.
 
     .. method :: __add__(other)
+    .. method :: __radd__(other)
     .. method :: __sub__(other)
+    .. method :: __rsub__(other)
     .. method :: __iadd__(other)
     .. method :: __isub__(other)
     .. method :: __neg__(other)
     .. method :: __mul__(other)
-    .. method :: __div__(other)
-    .. method :: __rdiv__(other)
+    .. method :: __rmul__(other)
+    .. method :: __truediv__(other)
+    .. method :: __rtruediv__(other)
     .. method :: __pow__(other)
+    .. method :: __rpow__(other)
 
     .. method :: __abs__()
 
         Return a :class:`GPUArray` containing the absolute value of each
         element of *self*.
 
+    .. method :: __getitem__(index)
+
     .. UNDOC reverse()
 
     .. method :: fill(scalar, stream=None)
@@ -182,7 +188,7 @@ The :class:`GPUArray` Array Class
     .. method :: astype(dtype, stream=None)
 
         Return *self*, cast to *dtype*.
-    
+
     .. method :: any(stream=None, allocator=None)
 
     .. method :: all(stream=None, allocator=None)
@@ -520,18 +526,18 @@ Quasirandom numbers are more expensive to generate.
 
 .. function:: seed_getter_uniform(N)
 
-    Return an :class:`GPUArray` filled with one random `int32` repeated `N`
+    Return a :class:`~pycuda.gpuarray.GPUArray` filled with one random `int32` repeated `N`
     times which can be used as a seed for XORWOW generator.
 
 .. function:: seed_getter_unique(N)
 
-    Return an :class:`GPUArray` filled with `N` random `int32` which can
+    Return a :class:`~pycuda.gpuarray.GPUArray` filled with `N` random `int32` which can
     be used as a seed for XORWOW generator.
 
 .. class:: XORWOWRandomNumberGenerator(seed_getter=None, offset=0)
 
     :arg seed_getter: a function that, given an integer count, will yield an
-      `int32` :class:`GPUArray` of seeds.
+      `int32` :class:`~pycuda.gpuarray.GPUArray` of seeds.
     :arg offset: Starting index into the XORWOW sequence, given seed.
 
     Provides pseudorandom numbers. Generates sequences with period
@@ -543,29 +549,29 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_uniform(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with uniformly distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with uniformly distributed
         pseudorandom values.
 
     .. method:: gen_uniform(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with uniformly distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_normal(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with normally distributed
         pseudorandom values.
 
     .. method:: gen_normal(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with normally distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_log_normal(data, mean, stddev, stream=None)
 
-        Fills in :class:`GPUArray` *data* with log-normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with log-normally distributed
         pseudorandom values with mean *mean* and standard deviation *stddev*.
 
         CUDA 4.0 and above.
@@ -574,7 +580,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with log-normally distributed pseudorandom values
         with mean *mean* and standard deviation *stddev*, and returns
         newly created object.
@@ -585,7 +591,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_poisson(data, lambda_value=None, stream=None)
 
-        Fills in :class:`GPUArray` *data* with Poisson distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with Poisson distributed
         pseudorandom values.
 
         If *lambda_value* is not None, it is used as lambda,
@@ -602,7 +608,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_poisson(shape, dtype, lambda_value, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with Poisson distributed pseudorandom values
         with lambda *lambda_value*, and returns newly created object.
         *dtype* must be 32-bit unsigned int.
@@ -635,7 +641,7 @@ Quasirandom numbers are more expensive to generate.
 .. class:: MRG32k3aRandomNumberGenerator(seed_getter=None, offset=0)
 
     :arg seed_getter: a function that, given an integer count, will yield an
-      `int32` :class:`GPUArray` of seeds.
+      `int32` :class:`~pycuda.gpuarray.GPUArray` of seeds.
     :arg offset: Starting index into the XORWOW sequence, given seed.
 
     Provides pseudorandom numbers. Generates sequences with period
@@ -647,41 +653,41 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_uniform(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with uniformly distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with uniformly distributed
         pseudorandom values.
 
     .. method:: gen_uniform(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with uniformly distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_normal(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with normally distributed
         pseudorandom values.
 
     .. method:: gen_normal(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with normally distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_log_normal(data, mean, stddev, stream=None)
 
-        Fills in :class:`GPUArray` *data* with log-normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with log-normally distributed
         pseudorandom values with mean *mean* and standard deviation *stddev*.
 
     .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with log-normally distributed pseudorandom values
         with mean *mean* and standard deviation *stddev*, and returns
         newly created object.
 
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
-        Fills in :class:`GPUArray` *data* with Poisson distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with Poisson distributed
         pseudorandom values.
 
         If *lambda_value* is not None, it is used as lambda,
@@ -698,7 +704,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_poisson(shape, dtype, lambda_value, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with Poisson distributed pseudorandom values
         with lambda *lambda_value*, and returns newly created object.
         *dtype* must be 32-bit unsigned int.
@@ -730,22 +736,22 @@ Quasirandom numbers are more expensive to generate.
 
 .. function:: generate_direction_vectors(count, direction=direction_vector_set.VECTOR_32)
 
-    Return an :class:`GPUArray` `count` filled with direction vectors
+    Return a :class:`~pycuda.gpuarray.GPUArray` filled with `count` direction vectors
     used to initialize Sobol generators.
 
 .. function:: generate_scramble_constants32(count)
 
-    Return a :class:`GPUArray` filled with `count' 32-bit unsigned integer
+    Return a :class:`~pycuda.gpuarray.GPUArray` filled with `count` 32-bit unsigned integer
     numbers used to initialize :class:`ScrambledSobol32RandomNumberGenerator`
 
 .. function:: generate_scramble_constants64(count)
 
-    Return a :class:`GPUArray` filled with `count' 64-bit unsigned integer
+    Return a :class:`~pycuda.gpuarray.GPUArray` filled with `count` 64-bit unsigned integer
     numbers used to initialize :class:`ScrambledSobol64RandomNumberGenerator`
 
 .. class:: Sobol32RandomNumberGenerator(dir_vector=None, offset=0)
 
-    :arg dir_vector: a :class:`GPUArray` of 32-element `int32` vectors which
+    :arg dir_vector: a :class:`~pycuda.gpuarray.GPUArray` of 32-element `int32` vectors which
       are used to initialize quasirandom generator; it must contain one vector
       for each initialized generator
     :arg offset: Starting index into the Sobol32 sequence, given direction
@@ -760,29 +766,29 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_uniform(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with uniformly distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with uniformly distributed
         quasirandom values.
 
     .. method:: gen_uniform(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with uniformly distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_normal(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with normally distributed
         quasirandom values.
 
     .. method:: gen_normal(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with normally distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_log_normal(data, mean, stddev, stream=None)
 
-        Fills in :class:`GPUArray` *data* with log-normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with log-normally distributed
         pseudorandom values with mean *mean* and standard deviation *stddev*.
 
         CUDA 4.0 and above.
@@ -791,7 +797,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with log-normally distributed pseudorandom values
         with mean *mean* and standard deviation *stddev*, and returns
         newly created object.
@@ -802,7 +808,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
-        Fills in :class:`GPUArray` *data* with Poisson distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with Poisson distributed
         pseudorandom values.
 
         If *lambda_value* is not None, it is used as lambda,
@@ -819,7 +825,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_poisson(shape, dtype, lambda_value, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with Poisson distributed pseudorandom values
         with lambda *lambda_value*, and returns newly created object.
         *dtype* must be 32-bit unsigned int.
@@ -840,10 +846,10 @@ Quasirandom numbers are more expensive to generate.
 
 .. class:: ScrambledSobol32RandomNumberGenerator(dir_vector=None, scramble_vector=None, offset=0)
 
-    :arg dir_vector: a :class:`GPUArray` of 32-element `uint32` vectors which
+    :arg dir_vector: a :class:`~pycuda.gpuarray.GPUArray` of 32-element `uint32` vectors which
       are used to initialize quasirandom generator; it must contain one vector
       for each initialized generator
-    :arg scramble_vector: a :class:`GPUArray` of `uint32` elements which
+    :arg scramble_vector: a :class:`~pycuda.gpuarray.GPUArray` of `uint32` elements which
       are used to initialize quasirandom generator; it must contain one number
       for each initialized generator
     :arg offset: Starting index into the Sobol32 sequence, given direction
@@ -858,29 +864,29 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_uniform(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with uniformly distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with uniformly distributed
         quasirandom values.
 
     .. method:: gen_uniform(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with uniformly distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_normal(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with normally distributed
         quasirandom values.
 
     .. method:: gen_normal(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with normally distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_log_normal(data, mean, stddev, stream=None)
 
-        Fills in :class:`GPUArray` *data* with log-normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with log-normally distributed
         pseudorandom values with mean *mean* and standard deviation *stddev*.
 
         CUDA 4.0 and above.
@@ -889,7 +895,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with log-normally distributed pseudorandom values
         with mean *mean* and standard deviation *stddev*, and returns
         newly created object.
@@ -900,7 +906,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
-        Fills in :class:`GPUArray` *data* with Poisson distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with Poisson distributed
         pseudorandom values.
 
         If *lambda_value* is not None, it is used as lambda,
@@ -917,7 +923,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_poisson(shape, dtype, lambda_value, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with Poisson distributed pseudorandom values
         with lambda *lambda_value*, and returns newly created object.
         *dtype* must be 32-bit unsigned int.
@@ -938,7 +944,7 @@ Quasirandom numbers are more expensive to generate.
 
 .. class:: Sobol64RandomNumberGenerator(dir_vector=None, offset=0)
 
-    :arg dir_vector: a :class:`GPUArray` of 64-element `uint64` vectors which
+    :arg dir_vector: a :class:`~pycuda.gpuarray.GPUArray` of 64-element `uint64` vectors which
       are used to initialize quasirandom generator; it must contain one vector
       for each initialized generator
     :arg offset: Starting index into the Sobol64 sequence, given direction
@@ -953,29 +959,29 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_uniform(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with uniformly distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with uniformly distributed
         quasirandom values.
 
     .. method:: gen_uniform(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with uniformly distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_normal(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with normally distributed
         quasirandom values.
 
     .. method:: gen_normal(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with normally distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_log_normal(data, mean, stddev, stream=None)
 
-        Fills in :class:`GPUArray` *data* with log-normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with log-normally distributed
         pseudorandom values with mean *mean* and standard deviation *stddev*.
 
         CUDA 4.0 and above.
@@ -984,7 +990,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with log-normally distributed pseudorandom values
         with mean *mean* and standard deviation *stddev*, and returns
         newly created object.
@@ -995,7 +1001,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
-        Fills in :class:`GPUArray` *data* with Poisson distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with Poisson distributed
         pseudorandom values.
 
         If *lambda_value* is not None, it is used as lambda,
@@ -1012,7 +1018,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_poisson(shape, dtype, lambda_value, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with Poisson distributed pseudorandom values
         with lambda *lambda_value*, and returns newly created object.
         *dtype* must be 32-bit unsigned int.
@@ -1033,10 +1039,10 @@ Quasirandom numbers are more expensive to generate.
 
 .. class:: ScrambledSobol64RandomNumberGenerator(dir_vector=None, scramble_vector=None, offset=0)
 
-    :arg dir_vector: a :class:`GPUArray` of 64-element `uint64` vectors which
+    :arg dir_vector: a :class:`~pycuda.gpuarray.GPUArray` of 64-element `uint64` vectors which
       are used to initialize quasirandom generator; it must contain one vector
       for each initialized generator
-    :arg scramble_vector: a :class:`GPUArray` of `uint64` vectors which
+    :arg scramble_vector: a :class:`~pycuda.gpuarray.GPUArray` of `uint64` vectors which
       are used to initialize quasirandom generator; it must contain one vector
       for each initialized generator
     :arg offset: Starting index into the ScrambledSobol64 sequence,
@@ -1051,29 +1057,29 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_uniform(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with uniformly distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with uniformly distributed
         quasirandom values.
 
     .. method:: gen_uniform(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with uniformly distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_normal(data, stream=None)
 
-        Fills in :class:`GPUArray` *data* with normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with normally distributed
         quasirandom values.
 
     .. method:: gen_normal(shape, dtype, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with normally distributed pseudorandom values,
         and returns newly created object.
 
     .. method:: fill_log_normal(data, mean, stddev, stream=None)
 
-        Fills in :class:`GPUArray` *data* with log-normally distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with log-normally distributed
         pseudorandom values with mean *mean* and standard deviation *stddev*.
 
         CUDA 4.0 and above.
@@ -1082,7 +1088,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with log-normally distributed pseudorandom values
         with mean *mean* and standard deviation *stddev*, and returns
         newly created object.
@@ -1093,7 +1099,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
-        Fills in :class:`GPUArray` *data* with Poisson distributed
+        Fills in :class:`~pycuda.gpuarray.GPUArray` *data* with Poisson distributed
         pseudorandom values.
 
         If *lambda_value* is not None, it is used as lambda,
@@ -1110,7 +1116,7 @@ Quasirandom numbers are more expensive to generate.
 
     .. method:: gen_poisson(shape, dtype, lambda_value, stream=None)
 
-        Creates object of :class:`GPUArray` with given *shape* and *dtype*,
+        Creates object of :class:`~pycuda.gpuarray.GPUArray` with given *shape* and *dtype*,
         fills it in with Poisson distributed pseudorandom values
         with lambda *lambda_value*, and returns newly created object.
         *dtype* must be 32-bit unsigned int.
@@ -1134,7 +1140,7 @@ Single-pass Custom Expression Evaluation
 
 .. module:: pycuda.elementwise
 
-Evaluating involved expressions on :class:`GPUArray` instances can be
+Evaluating involved expressions on :class:`~pycuda.gpuarray.GPUArray` instances can be
 somewhat inefficient, because a new temporary is created for each
 intermediate result. The functionality in the module :mod:`pycuda.elementwise`
 contains tools to help generate kernels that evaluate multi-stage expressions
@@ -1160,7 +1166,7 @@ on one or several operands in a single pass.
     .. method:: __call__(*args, range=None, slice=None)
 
         Invoke the generated scalar kernel. The arguments may either be scalars or
-        :class:`GPUArray` instances.
+        :class:`~pycuda.gpuarray.GPUArray` instances.
 
         If *range* is given, it must be a :class:`slice` object and specifies
         the range of indices *i* for which the *operation* is carried out.
@@ -1231,13 +1237,13 @@ Custom Reductions
     .. method:: __call__(*args, stream=None, out=None)
 
         Invoke the generated reduction kernel. The arguments may either be scalars or
-        :class:`GPUArray` instances. The reduction will be done on each entry of
+        :class:`~pycuda.gpuarray.GPUArray` instances. The reduction will be done on each entry of
         the first vector argument.
 
         If *stream* is given, it must be a :class:`pycuda.driver.Stream` object,
         where the execution will be serialized.
 
-        With *out* the resulting single-entry :class:`GPUArray` can be specified.
+        With *out* the resulting single-entry :class:`~pycuda.gpuarray.GPUArray` can be specified.
         Because offsets are supported one can store results anywhere (e.g. out=a[3]).
 
 Here's a usage example::
@@ -1310,7 +1316,7 @@ know about them using this function:
 
 .. function:: pycuda.tools.register_dtype(dtype, name)
 
-    *dtype* is a :func:`numpy.dtype`.
+    *dtype* is a :class:`numpy.dtype`.
 
     .. versionadded: 2011.2
 
@@ -1319,4 +1325,4 @@ GPGPU Algorithms
 
 Bogdan Opanchuk's `reikna <http://pypi.python.org/pypi/reikna>`_ offers a
 variety of GPU-based algorithms (FFT, RNG, matrix multiplication) designed to work with
-:class:`pycuda.gpuarray.GPUArray` objects.
+:class:`~pycuda.gpuarray.GPUArray` objects.
diff --git a/doc/conf.py b/doc/conf.py
index af3dc36e..5796b12c 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -22,4 +22,9 @@
     "python": ("https://docs.python.org/3", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
     "codepy": ("https://documen.tician.de/codepy/", None),
+    "pytest": ("https://docs.pytest.org/en/stable/", None),
 }
+
+nitpick_ignore_regex = [
+    ["py:class", r"numpy.(float32|u?int32)"],  # not sure why these don't work?
+]
diff --git a/doc/driver.rst b/doc/driver.rst
index 6e6d65f5..d1fb596a 100644
--- a/doc/driver.rst
+++ b/doc/driver.rst
@@ -69,7 +69,7 @@ Error Reporting
 .. exception:: LogicError
 
     Thrown when PyCuda was confronted with a situation where it is likely
-    that the programmer has made a mistake. :exc:`LogicErrors` do not depend
+    that the programmer has made a mistake. :exc:`LogicError`\ s do not depend
     on outer circumstances defined by the run-time environment.
 
     Example: CUDA was used before it was initialized.
@@ -709,7 +709,7 @@ Devices and Contexts
 
         Return the :class:`Context` obtained by retaining the device's
         primary context, which is the one used by the CUDA runtime API.
-        Unlike :meth:`Context.make_context`, the newly-created context is not made current.
+        Unlike :meth:`make_context`, the newly-created context is not made current.
 
         CUDA 7.0 and newer.
 
@@ -914,7 +914,7 @@ Global Device Memory
 .. function:: to_device(buffer)
 
     Allocate enough device memory for *buffer*, which adheres to the Python
-    :class:`buffer` interface. Copy the contents of *buffer* onto the device.
+    :ref:`python:bufferobjects` interface. Copy the contents of *buffer* onto the device.
     Return a :class:`DeviceAllocation` object representing the newly-allocated
     memory.
 
@@ -1003,7 +1003,7 @@ Global Device Memory
 
         If your subclass provides its own :meth:`!__init__`, it must call
         the base class :meth:`!__init__`. Failure to do so will lead to
-        :exc:`Boost.Python.ArgumentError` being raised when it is used.
+        ``Boost.Python.ArgumentError`` being raised when it is used.
 
 .. _pagelocked_memory :
 
@@ -1494,7 +1494,7 @@ Arrays and Textures
 
 .. function:: gpuarray_to_array(gpuparray, order, allowSurfaceBind=False)
 
-    Turn a :class:`GPUArray` with 2D or 3D structure, into an
+    Turn a :class:`~pycuda.gpuarray.GPUArray` with 2D or 3D structure, into an
     :class:`Array`. Same structure and use of :func:`np_to_array`
 
     .. versionadded:: 2015.1
@@ -1833,7 +1833,7 @@ Code on the Device: Modules and Functions
         executing the kernel, depending on whether *time_kernel* is *True*.
 
         This is a convenience interface that can be used instead of the
-        :meth:`param_*` and :meth:`launch_*` methods below.  For a faster (but
+        ``param_*`` and ``launch_*`` methods below.  For a faster (but
         mildly less convenient) way of invoking kernels, see :meth:`prepare` and
         :meth:`prepared_call`.
 
@@ -1848,10 +1848,10 @@ Code on the Device: Modules and Functions
         * Instances of :class:`ArgumentHandler` subclasses. These can be used to
           automatically transfer :mod:`numpy` arrays onto and off of the device.
 
-        * Objects supporting the Python :class:`buffer` interface. These chunks
+        * Objects supporting the Python :ref:`python:bufferobjects` interface. These chunks
           of bytes will be copied into the parameter space verbatim.
 
-        * :class:`GPUArray` instances.
+        * :class:`~pycuda.gpuarray.GPUArray` instances.
 
         .. warning::
 
@@ -2052,17 +2052,17 @@ Code on the Device: Modules and Functions
 
 .. class:: In(array)
 
-    Inherits from :class:`ArgumentHandler`. Indicates that :class:`buffer`
+    Inherits from :class:`ArgumentHandler`. Indicates that the :ref:`buffer <python:bufferobjects>`
     *array* should be copied to the compute device before invoking the kernel.
 
 .. class:: Out(array)
 
-    Inherits from :class:`ArgumentHandler`. Indicates that :class:`buffer`
+    Inherits from :class:`ArgumentHandler`. Indicates that the :ref:`buffer <python:bufferobjects>`
     *array* should be copied off the compute device after invoking the kernel.
 
 .. class:: InOut(array)
 
-    Inherits from :class:`ArgumentHandler`. Indicates that :class:`buffer`
+    Inherits from :class:`ArgumentHandler`. Indicates that the :ref:`buffer <python:bufferobjects>`
     *array* should be copied both onto the compute device before invoking
     the kernel, and off it afterwards.
 
@@ -2101,7 +2101,7 @@ Just-in-time Compilation
     by the module :mod:`pycuda.gpuarray`.
 
     The initial value of this variable is taken from the environment variable
-    :envvar:`PYCUDA_DEFAULT_NVCC_FLAGS`.
+    ``PYCUDA_DEFAULT_NVCC_FLAGS``.
 
     If you modify this variable in your code, please be aware that this is a
     globally shared variable that may be modified by multiple packages. Please
@@ -2110,8 +2110,8 @@ Just-in-time Compilation
 
 .. class:: SourceModule(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[])
 
-    Create a :class:`Module` from the CUDA source code *source*. The Nvidia
-    compiler *nvcc* is assumed to be on the :envvar:`PATH` if no path to it is
+    Create a :class:`pycuda.driver.Module` from the CUDA source code *source*. The Nvidia
+    compiler *nvcc* is assumed to be on the ``PATH`` if no path to it is
     specified, and is invoked with *options* to compile the code. If *keep* is
     *True*, the compiler output directory is kept, and a line indicating its
     location in the file system is printed for debugging purposes.
@@ -2125,10 +2125,10 @@ Just-in-time Compilation
     If `code` is `None`, it will not be specified.
 
     `cache_dir` gives the directory used for compiler caching.  If `None`
-    then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or
+    then `cache_dir` is taken to be ``PYCUDA_CACHE_DIR`` if set or
     a sensible per-user default.  If passed as `False`, caching is disabled.
 
-    If the environment variable :envvar:`PYCUDA_DISABLE_CACHE` is set to
+    If the environment variable ``PYCUDA_DISABLE_CACHE`` is set to
     any value then caching is disabled.  This preference overrides any
     value of `cache_dir` and can be used to disable caching globally.
 
diff --git a/doc/gl.rst b/doc/gl.rst
index 494d520b..1a9f883f 100644
--- a/doc/gl.rst
+++ b/doc/gl.rst
@@ -88,7 +88,7 @@ Automatic Initialization
 
 .. warning ::
 
-    Importing :mod:`pycuda.gl.autoinit` will fail with a rather unhelpful error 
+    Importing :mod:`pycuda.gl.autoinit` will fail with a rather unhelpful error
     message if you don't already have a GL context created and active.
 
 .. data:: device
@@ -102,9 +102,10 @@ Old-style (pre-CUDA 3.0) API
     Enable GL interoperability for the already-created (so far non-GL)
     and currently active :class:`pycuda.driver.Context`.
 
-    According to the forum post referenced in the note below, this will succeed 
-    on Windows XP and Linux, but it will not work on Windows Vista. There you 
-    *have* to create the GL-enabled context using :func:`make_context`.
+    According to the forum post referenced in the note below, this will succeed
+    on Windows XP and Linux, but it will not work on Windows Vista. There you
+    *have* to create the GL-enabled context using
+    :meth:`~pycuda.driver.Device.make_context`.
 
     .. warning ::
 
@@ -112,7 +113,7 @@ Old-style (pre-CUDA 3.0) API
 
     .. warning ::
 
-        This will fail with a rather unhelpful error message if you don't already 
+        This will fail with a rather unhelpful error message if you don't already
         have a GL context created and active.
 
 .. note ::
diff --git a/doc/misc.rst b/doc/misc.rst
index 1db4ac10..eb84ed5e 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -17,19 +17,21 @@ Version 2018.1
 --------------
 
 * Update Boost.Python for better PyPy support
-* Add :meth:`pycuda.elementwise.ElementwiseKernel.get_texref`.
+* Add ``pycuda.elementwise.ElementwiseKernel.get_texref``.
 * Bug fixes.
 
 Version 2017.2
 --------------
 
-* :func:`zeros_like` and :func:`empty_like` now have  *dtype* and *order*
+* :func:`~pycuda.gpuarray.zeros_like` and
+  :func:`~pycuda.gpuarray.empty_like` now have *dtype* and *order*
   arguments as in numpy.  Previously these routines always returned a
   C-order array.  The new default behavior follows the numpy default, which is
   to match the order and strides of the input as closely as possible.
-* A :func:`ones_like` gpuarray function was added.
-* methods :attr:`GPUArray.imag`, :attr:`GPUArray.real`, :meth:`GPUArray.conj`
-  now all return Fortran-ordered arrays when the :class:`GPUArray` is
+* A :func:`~pycuda.gpuarray.ones_like` gpuarray function was added.
+* methods :attr:`~pycuda.gpuarray.GPUArray.imag`, :attr:`~pycuda.gpuarray.GPUArray.real`,
+  :meth:`~pycuda.gpuarray.GPUArray.conj`
+  now all return Fortran-ordered arrays when the :class:`pycuda.gpuarray.GPUArray` is
   Fortran-ordered.
 
 Version 2016.2
@@ -50,8 +52,9 @@ Version 2016.1
 Version 2014.1
 --------------
 
-* Add :meth:`PointerHolderBase.as_buffer` and :meth:`DeviceAllocation.as_buffer`.
-* Support for :class:`device_attribute` values added in CUDA 5.0, 5.5, and 6.0.
+* Add :meth:`pycuda.driver.PointerHolderBase.as_buffer` and
+  :meth:`pycuda.driver.DeviceAllocation.as_buffer`.
+* Support for :class:`pycuda.driver.device_attribute` values added in CUDA 5.0, 5.5, and 6.0.
 * Support for :ref:`managed_memory`. (contributed by Stan Seibert)
 
 Version 2013.1.1
@@ -73,7 +76,7 @@ Version 2013.1
 
 .. note::
 
-    The addition of :meth:`pyopencl.array.Array.__getitem__` has an unintended
+    The addition of :meth:`pycuda.gpuarray.GPUArray.__getitem__` has an unintended
     consequence due to `numpy bug 3375
     <https://github.com/numpy/numpy/issues/3375>`_.  For instance, this
     expression::
@@ -111,7 +114,7 @@ Version 2011.2
 * Fix a memory leak when using pagelocked memory. (reported by Paul Cazeaux)
 * Fix complex scalar argument passing.
 * Fix :func:`pycuda.gpuarray.zeros` when used on complex arrays.
-* Add :func:`pycuda.tools.register_dtype` to enable scan/reduction on struct types.
+* Add ``pycuda.tools.register_dtype`` to enable scan/reduction on struct types.
 * More improvements to CURAND.
 * Add support for CUDA 4.1.
 
@@ -169,7 +172,7 @@ Version 0.94
 * Support for CUDA 3.2 RC.
   Search for "CUDA 3.2" in :ref:`reference-doc` to see what's new.
 * Add sparse matrix-vector multiplication and linear system solving code,
-  in :mod:`pycuda.sparse`.
+  in ``pycuda.sparse``.
 * Add :func:`pycuda.gpuarray.if_positive`, :func:`pycuda.gpuarray.maximum`,
   :func:`pycuda.gpuarray.minimum`.
 * Deprecate :func:`pycuda.tools.get_default_device`
@@ -178,16 +181,16 @@ Version 0.94
   which changes its behavior.
 * Remove previously deprecated features:
 
-  * :attr:`pycuda.driver.Function.registers`,
-    :attr:`pycuda.driver.Function.lmem`, and
-    :attr:`pycuda.driver.Function.smem` have been deprecated in favor of the
+  * ``pycuda.driver.Function.registers``,
+    ``pycuda.driver.Function.lmem``, and
+    ``pycuda.driver.Function.smem`` have been deprecated in favor of the
     mechanism above. See :attr:`pycuda.driver.Function.num_regs` for more.
   * the three-argument forms (i.e. with streams)
     of :func:`pycuda.driver.memcpy_dtoh` and
     :func:`pycuda.driver.memcpy_htod`. Use
     :func:`pycuda.driver.memcpy_dtoh_async`
     and :func:`pycuda.driver.memcpy_htod_async` instead.
-  * :class:`pycuda.driver.SourceModule`.
+  * ``pycuda.driver.SourceModule``.
 
 * Add :func:`pycuda.tools.context_dependent_memoize`, use it for
   context-dependent caching of PyCUDA's canned kernels.
@@ -231,7 +234,7 @@ Version 0.93
   :class:`pycuda.driver.Stream`.  Asynchronous GPUArray transfers are
   now separate from synchronous ones and have an ``_async`` suffix.
 * Support for features added in CUDA 2.2.
-* :class:`pycuda.driver.SourceModule` has been moved to
+* ``pycuda.driver.SourceModule`` has been moved to
   :class:`pycuda.compiler.SourceModule`. It is still available by
   the old name, but will print a warning about the impending
   deprecation.
@@ -239,9 +242,9 @@ Version 0.93
   :class:`pycuda.driver.device_attribute` `attr` can now be spelled
   `dev.attr`, with no further namespace detours. (Suggested by Ian Cullinan)
   Likewise for :meth:`pycuda.driver.Function.get_attribute`
-* :attr:`pycuda.driver.Function.registers`,
-  :attr:`pycuda.driver.Function.lmem`, and
-  :attr:`pycuda.driver.Function.smem` have been deprecated in favor of the
+* ``pycuda.driver.Function.registers``,
+  ``pycuda.driver.Function.lmem``, and
+  ``pycuda.driver.Function.smem`` have been deprecated in favor of the
   mechanism above. See :attr:`pycuda.driver.Function.num_regs` for more.
 * Add PyCUDA version query mechanism, see :data:`pycuda.VERSION`.
 
@@ -272,7 +275,7 @@ Version 0.92
 * Automatically run Python GC before throwing out-of-memory errors.
 * Allow explicit release of memory using
   :meth:`pycuda.driver.DeviceAllocation.free`,
-  :meth:`pycuda.driver.HostAllocation.free`,
+  :meth:`pycuda.driver.PagelockedHostAllocation.free`,
   :meth:`pycuda.driver.Array.free`,
   :meth:`pycuda.tools.PooledDeviceAllocation.free`,
   :meth:`pycuda.tools.PooledHostAllocation.free`.
@@ -309,7 +312,7 @@ Version 0.91
 * :class:`pycuda.gpuarray.GPUArray` parallelizes properly on
   GTX200-generation devices.
 * Make :class:`pycuda.driver.Function` resource usage available
-  to the program. (See, e.g. :attr:`pycuda.driver.Function.registers`.)
+  to the program. (See, e.g. ``pycuda.driver.Function.registers``.)
 * Cache kernels compiled by :class:`pycuda.compiler.SourceModule`.
   (Tom Annau)
 * Allow for faster, prepared kernel invocation.
diff --git a/doc/util.rst b/doc/util.rst
index c7998994..50a072c5 100644
--- a/doc/util.rst
+++ b/doc/util.rst
@@ -38,7 +38,7 @@ Choice of Device
   Return a :class:`pycuda.driver.Context` instance chosen according to the
   following rules:
 
-   * If the environment variable :envvar:`CUDA_DEVICE` is set, its integer
+   * If the environment variable ``CUDA_DEVICE`` is set, its integer
      value is used as the device number.
 
    * If the file :file:`.cuda-device` is present in the user's home directory,
@@ -57,7 +57,7 @@ Choice of Device
   Return a :class:`pycuda.driver.Device` instance chosen according to the
   following rules:
 
-   * If the environment variable :envvar:`CUDA_DEVICE` is set, its integer
+   * If the environment variable ``CUDA_DEVICE`` is set, its integer
      value is used as the device number.
 
    * If the file :file:`.cuda-device` is present in the user's home directory,
@@ -90,7 +90,7 @@ Testing
 
 .. function:: mark_cuda_test(func)
 
-    This function, meant for use with :mod:`py.test`, will mark *func* with a
+    This function, meant for use with :mod:`pytest`, will mark *func* with a
     "cuda" tag and make sure it has a CUDA context available when invoked.
 
 
@@ -199,8 +199,8 @@ Device-based Memory Pool
     An object representing a :class:`DeviceMemoryPool`-based allocation of
     linear device memory.  Once this object is deleted, its associated device
     memory is freed.
-    :class:`PooledDeviceAllocation` instances can be cast to :class:`int`
-    (and :class:`long`), yielding the starting address of the device memory
+    :class:`PooledDeviceAllocation` instances can be cast to :class:`int`,
+    yielding the starting address of the device memory
     allocated.
 
     .. method:: free