1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License
//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod
//! Ready-to-Code]][gitpod]
//!
//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
//!
//! [MSRV]: https://img.shields.io/badge/MSRV-1.81.0--nightly-orange
//! [repo]: https://github.com/juntyr/rust-cuda
//!
//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue
//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_kernel/
//!
//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield
//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield
//!
//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx
//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda
//!
//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod
//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda
//!
//! `rust-cuda-kernel` provides the [`#[kernel]`](macro@kernel) attribute
//! macro. When applied to a function, it compiles it as a CUDA kernel that
//! can be *safely* called from Rust code on the host.

#![deny(clippy::complexity)]
#![deny(clippy::correctness)]
#![warn(clippy::nursery)]
#![warn(clippy::pedantic)]
#![deny(clippy::perf)]
#![deny(clippy::style)]
#![deny(clippy::suspicious)]
#![deny(unsafe_code)]
#![warn(missing_docs)]
#![feature(box_patterns)]
#![feature(proc_macro_tracked_env)]
#![feature(proc_macro_span)]
#![feature(let_chains)]
#![feature(map_try_insert)]
#![feature(proc_macro_def_site)]
#![feature(cfg_version)]
#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")]

extern crate proc_macro;

#[macro_use]
extern crate proc_macro_error2;

use proc_macro::TokenStream;

mod kernel;

#[proc_macro_error]
#[proc_macro_attribute]
/// Provides the [`#[kernel]`](macro@kernel) attribute macro. When applied to a
/// function, it compiles it as a CUDA kernel that can be *safely* called from
/// Rust code on the host.
///
/// The annotated function must be public, not const, not async, not have an
/// explicit ABI, not be variadic, not have a receiver (e.g. `&self`), and
/// return the unit type `()`. At the moment, the kernel function must also
/// not use a where clause – use type generic bounds instead.
///
/// While the [`#[kernel]`](macro@kernel) attribute supports functions with any
/// number of arguments, [`rust_cuda::kernel::TypedPtxKernel`] only supports
/// launching kernels with up to 12 parameters at the moment.
///
/// The [`#[kernel]`](macro@kernel) attribute uses the following syntax:
///
/// ```rust,ignore
/// #[kernel(pub? use link! for impl)]
/// fn my_kernel(/* parameters */) {
///     /* kernel code */
/// }
/// ```
///
/// where `link` is the name of a macro that will be generated to manually link
/// specific monomorphised instantiations of the (optionally generic) kernel
/// function, and the optional `pub` controls whether this macro is public or
/// private.
///
/// Note that all kernel parameters must implement the sealed
/// [`rust_cuda::kernel::CudaKernelParameter`] trait.
///
/// To use a specific monomorphised instantiation of the kernel, the generated
/// `link!` macro must be invoked with the following syntax:
///
/// ```rust,ignore
/// struct KernelPtx;
/// link! { impl my_kernel for KernelPtx }
/// ```
/// for the non-generic kernel function `my_kernel` and a non-generic marker
/// type `KernelPtx`, which can be used as the generic `Kernel` type parameter
/// for [`rust_cuda::kernel::TypedPtxKernel`] to instantiate and launch the
/// kernel. Specifically, the [`rust_cuda::kernel::CompiledKernelPtx`] trait is
/// implemented for the `KernelPtx` type.
///
/// If the kernel function is generic, the following syntax is used instead:
/// ```rust,ignore
/// #[kernel(pub? use link! for impl)]
/// fn my_kernel<'a, A, B: Bounded, const N: usize>(/* parameters */) {
///     /* kernel code */
/// }
///
/// struct KernelPtx<'a, A, B: Bounded, const N: usize>(/* ... */);
/// link! { impl my_kernel<'a, u32, MyStruct, 42> for KernelPtx }
/// link! { impl my_kernel<'a, bool, MyOtherStruct, 24> for KernelPtx }
/// ```
///
/// If the kernel generic space is closed, the `link!` macro can be made
/// private and all instantiations must be requested in the same crate that
/// defines the kernel function. If downstream code should be allowed to use
/// and compile new specific monomorphised instantiations of the kernel, the
/// `link!` macro should be publicly exported. Then, downstream code can define
/// its own `MyKernelPtx` marker types for which the kernel is linked and which
/// can be passed to [`rust_cuda::kernel::CompiledKernelPtx`]-generic code in
/// the kernel-defining crate to construct the requested
/// [`rust_cuda::kernel::TypedPtxKernel`].
///
/// Inside the scope of the [`#[kernel]`](macro@kernel) attribute, a helper
/// `#[kernel(...)]` attribute can be applied to the kernel function:
///
/// - `#[kernel(crate = "<crate-path>")]` changes the path to the [`rust-cuda`]
///   crate that the kernel compilation uses, which by default is `rust_cuda`.
/// - `#[kernel(allow/warn/deny/forbid(<lint>))]` checks the specified
///   CUDA-specific lint for each kernel compilation, using default Rust
///   semantics for allowing, warning on, denying, or forbidding a lint. The
///   following lints are supported:
///   - `ptx::double_precision_use`: check for any uses of [`f64`] operations
///     inside the compiled PTX binary, as they are often significantly less
///     performant on NVIDIA GPUs than [`f32`] operations. By default,
///     `#[kernel(warn(ptx::double_precision_use))]` is set.
///   - `ptx::local_memory_use`: check for any usage of local memory, which may
///     slow down kernel execution. By default,
///     `#[kernel(warn(ptx::local_memory_use))]` is set.
///   - `ptx::register_spills`: check for any spills of registers to local
///     memory. While using fewer registers can allow more kernels to be run in
///     parallel, register spills may also point to missed optimisations. By
///     default, `#[kernel(warn(ptx::register_spills))]` is set.
///   - `ptx::dynamic_stack_size`: check if the PTX compiler is unable to
///     statically determine the size of the required kernel function stack.
///     When the static stack size is known, the compiler may be able to keep it
///     entirely within the fast register file. However, when the stack size is
///     dynamic, more costly memory load and store operations are needed. By
///     default, `#[kernel(warn(ptx::dynamic_stack_size))]` is set.
///   - `ptx::verbose`: utility lint to output verbose PTX compiler messages as
///     warnings (`warn`) or errors (`deny` or `forbid`) or to not output them
///     (`allow`). By default, `#[kernel(allow(ptx::verbose))]` is set.
///   - `ptx::dump_assembly`: utility lint to output the compiled PTX assembly
///     code as a warning (`warn`) or an error (`deny` or `forbid`) or to not
///     output it (`allow`). By default, `#[kernel(allow(ptx::dump_assembly))]`
///     is set.
///
/// [`rust_cuda::kernel::TypedPtxKernel`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/struct.TypedPtxKernel.html
/// [`rust_cuda::kernel::CudaKernelParameter`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CudaKernelParameter.html
/// [`rust_cuda::kernel::CompiledKernelPtx`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CompiledKernelPtx.html
/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda
pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream {
    // All parsing, validation, and code generation is implemented in the
    // `kernel` module; this thin shim exists only because proc-macro entry
    // points must be declared at the crate root.
    kernel::wrapper::kernel(attr, func)
}

#[doc(hidden)]
#[proc_macro_error]
#[proc_macro]
/// Helper macro to specialise the generic kernel param types when compiling
/// the specialised kernel for CUDA.
///
/// This macro is an internal implementation detail of the
/// [`#[kernel]`](macro@kernel) attribute (hence `#[doc(hidden)]`) and is not
/// part of the public API; its syntax may change without notice.
pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream {
    kernel::specialise::param_type::specialise_kernel_param_type(tokens)
}

#[doc(hidden)]
#[proc_macro_error]
#[proc_macro]
/// Helper macro to specialise the CUDA kernel entry point name, used on the
/// host for linking to it.
///
/// This macro is an internal implementation detail of the
/// [`#[kernel]`](macro@kernel) attribute (hence `#[doc(hidden)]`) and is not
/// part of the public API; its syntax may change without notice.
pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream {
    kernel::specialise::entry_point::specialise_kernel_entry_point(tokens)
}

#[doc(hidden)]
#[proc_macro_error]
#[proc_macro_attribute]
/// Helper macro to specialise the name of the CUDA kernel function item, used
/// to give each specialised version a unique ident when compiling for CUDA.
///
/// Unlike the sibling helpers, this one is an attribute macro (it rewrites
/// the annotated function item rather than expanding free-standing tokens).
/// It is an internal implementation detail of the [`#[kernel]`](macro@kernel)
/// attribute (hence `#[doc(hidden)]`) and is not part of the public API.
pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream {
    kernel::specialise::function::specialise_kernel_function(attr, func)
}

#[doc(hidden)]
#[proc_macro_error]
#[proc_macro]
/// Helper macro to cheaply check the generic CUDA kernel, used on the host to
/// provide code error feedback even when no specialised kernel is linked.
///
/// This macro is an internal implementation detail of the
/// [`#[kernel]`](macro@kernel) attribute (hence `#[doc(hidden)]`) and is not
/// part of the public API; its syntax may change without notice.
pub fn check_kernel(tokens: TokenStream) -> TokenStream {
    kernel::link::check_kernel(tokens)
}

#[doc(hidden)]
#[proc_macro_error]
#[proc_macro]
/// Helper macro to compile a specialised CUDA kernel and produce its PTX
/// assembly code, which is used on the host when linking specialised kernels.
///
/// This macro is an internal implementation detail of the
/// [`#[kernel]`](macro@kernel) attribute (hence `#[doc(hidden)]`) and is not
/// part of the public API; its syntax may change without notice.
pub fn compile_kernel(tokens: TokenStream) -> TokenStream {
    kernel::link::compile_kernel(tokens)
}