developerskyebrowse Cursor committed on
Commit
e6de801
·
1 Parent(s): 52b092a

Migrate from TRELLIS v1 to TRELLIS.2 pipeline

Browse files

- Replace trellis/ with trellis2/ (3-stage: sparse structure -> shape -> texture)
- Model: microsoft/TRELLIS.2-4B (upgraded from TRELLIS-image-large)
- PyTorch 2.6.0 + CUDA 12.4, flash_attn_3, o_voxel, cumesh, flex_gemm
- GLB export via o_voxel.postprocess.to_glb with PBR materials
- Add HDRI envmaps for shaded rendering (forest, sunset, courtyard)
- Keep existing Gradio UI: LitModel3D, video preview, STL export
- Remove old extensions/, wheels/, Dockerfile (Gradio SDK space)

Co-authored-by: Cursor <cursoragent@cursor.com>

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +9 -1
  3. app.py +261 -546
  4. wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl → assets/app/basecolor.png +2 -2
  5. assets/app/clay.png +3 -0
  6. assets/app/hdri_city.png +3 -0
  7. assets/app/hdri_courtyard.png +3 -0
  8. assets/app/hdri_forest.png +3 -0
  9. assets/app/hdri_interior.png +3 -0
  10. assets/app/hdri_night.png +3 -0
  11. assets/app/hdri_studio.png +3 -0
  12. assets/app/hdri_sunrise.png +3 -0
  13. assets/app/hdri_sunset.png +3 -0
  14. assets/app/normal.png +3 -0
  15. assets/hdri/city.exr +3 -0
  16. assets/hdri/courtyard.exr +3 -0
  17. assets/hdri/forest.exr +3 -0
  18. assets/hdri/interior.exr +3 -0
  19. assets/hdri/license.txt +15 -0
  20. assets/hdri/night.exr +3 -0
  21. assets/hdri/studio.exr +3 -0
  22. assets/hdri/sunrise.exr +3 -0
  23. assets/hdri/sunset.exr +3 -0
  24. autotune_cache.json +0 -0
  25. extensions/nvdiffrast/LICENSE.txt +0 -97
  26. extensions/nvdiffrast/README.md +0 -42
  27. extensions/nvdiffrast/nvdiffrast/__init__.py +0 -9
  28. extensions/nvdiffrast/nvdiffrast/common/antialias.cu +0 -558
  29. extensions/nvdiffrast/nvdiffrast/common/antialias.h +0 -50
  30. extensions/nvdiffrast/nvdiffrast/common/common.cpp +0 -60
  31. extensions/nvdiffrast/nvdiffrast/common/common.h +0 -263
  32. extensions/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp +0 -63
  33. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl +0 -423
  34. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp +0 -94
  35. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp +0 -55
  36. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl +0 -730
  37. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp +0 -73
  38. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp +0 -79
  39. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp +0 -90
  40. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl +0 -385
  41. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp +0 -153
  42. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp +0 -370
  43. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp +0 -102
  44. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl_.cu +0 -37
  45. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl +0 -402
  46. extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl +0 -452
  47. extensions/nvdiffrast/nvdiffrast/common/framework.h +0 -49
  48. extensions/nvdiffrast/nvdiffrast/common/glutil.cpp +0 -403
  49. extensions/nvdiffrast/nvdiffrast/common/glutil.h +0 -113
  50. extensions/nvdiffrast/nvdiffrast/common/glutil_extlist.h +0 -48
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
37
  *.png filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
37
  *.png filter=lfs diff=lfs merge=lfs -text
38
+ *.exr filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,2 +1,10 @@
1
  model_cache/
2
- AGENTS.md
 
 
 
 
 
 
 
 
 
1
  model_cache/
2
+ AGENTS.md
3
+ __pycache__/
4
+ *.pyc
5
+ cache/
6
+ tmp/
7
+ trellis/
8
+ extensions/
9
+ wheels/
10
+ TRELLIS.2/
app.py CHANGED
@@ -1,502 +1,293 @@
1
- import sys
2
- import os
3
- print(f"[DIAG] app.py starting, __name__={__name__}, argv={sys.argv}", flush=True)
4
-
5
  import argparse
 
 
6
  import time
7
- print("[DIAG] importing gradio...", flush=True)
 
 
 
 
 
 
 
 
8
  import gradio as gr
9
- print(f"[DIAG] gradio imported, NO_RELOAD={gr.NO_RELOAD}", flush=True)
10
  import spaces
11
  from gradio_litmodel3d import LitModel3D
12
- print("[DIAG] all top-level UI imports done", flush=True)
13
  sys.path.append(os.getcwd())
14
- os.environ['ATTN_BACKEND'] = 'xformers'
15
- os.environ['SPCONV_ALGO'] = 'native'
16
- os.environ['TORCH_CUDA_ARCH_LIST'] = '8.9'
17
- import concurrent.futures
18
  from typing import *
19
- print("[DIAG] importing torch...", flush=True)
20
  import torch
21
- print(f"[DIAG] torch imported, cuda.is_available={torch.cuda.is_available()}", flush=True)
22
  import numpy as np
23
  import imageio
24
  from PIL import Image
25
  import trimesh
26
-
27
  from datetime import datetime
28
  import logging
29
 
 
 
 
 
 
 
30
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
31
  logger = logging.getLogger(__name__)
32
 
33
- # --- Command Line Args ---
34
- print("[DIAG] setting up argparse...", flush=True)
35
  parser = argparse.ArgumentParser(description="Pocket 3D AI 2")
36
- parser.add_argument("--prod",
37
- action="store_true",
38
- help="Run in production mode")
39
- parser.add_argument("--port",
40
- type=int,
41
- help="Port to run the server on (default: 8081 for prod, 8080 for dev)")
42
- cmd_args, unknown_args = parser.parse_known_args()
43
- if unknown_args:
44
- print(f"[DIAG] WARNING: unknown args ignored: {unknown_args}", flush=True)
45
- print(f"[DIAG] argparse done, prod={cmd_args.prod}, port={cmd_args.port}", flush=True)
46
 
47
  prod = cmd_args.prod
48
  port = cmd_args.port if cmd_args.port else (8081 if prod else 8080)
49
  show_options = not prod
50
 
51
  MAX_SEED = np.iinfo(np.int32).max
52
-
53
  TMP_DIR = os.path.join('cache')
54
  os.makedirs(TMP_DIR, exist_ok=True)
55
 
56
- print(f"[DIAG] entering gr.NO_RELOAD block (NO_RELOAD={gr.NO_RELOAD})...", flush=True)
57
  if gr.NO_RELOAD:
58
- print("[DIAG] importing trellis pipeline...", flush=True)
59
- from trellis.pipelines.trellis_image_to_3d import TrellisImageTo3DPipeline
60
- print("[DIAG] importing trellis utils...", flush=True)
61
- from trellis.utils import render_utils, postprocessing_utils
62
- print("[DIAG] trellis imports done", flush=True)
63
  pipeline = None
 
64
 
65
  def initialize_pipeline():
66
- global pipeline
67
  if pipeline is not None:
68
- logger.info("Pipeline already initialized.")
69
  return
70
 
71
- logger.info("Initializing pipeline...")
72
  start_time = time.time()
73
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
74
- target_dtype = torch.float16 if device.type == 'cuda' else torch.float32
75
- logger.info(f"Target device: {device}, Target dtype: {target_dtype}")
76
 
77
  try:
78
- logger.info("Loading models from pretrained...")
79
- pipeline = TrellisImageTo3DPipeline.from_pretrained("JeffreyXiang/TRELLIS-image-large")
80
-
81
- # Check where models landed
82
- sample_model = next(iter(pipeline.models.values()))
83
- current_device = next(sample_model.parameters()).device
84
- logger.info(f"Models initially on: {current_device}")
85
-
86
- # On ZeroGPU, no real GPU at init time. Ensure everything is on CPU.
87
- if current_device.type != 'cpu':
88
- logger.info("Moving pipeline to CPU for stable initialization...")
89
- pipeline.cpu()
90
-
91
- if hasattr(pipeline, 'rmbg_model') and pipeline.rmbg_model is not None:
92
- rmbg_device = next(pipeline.rmbg_model.parameters()).device
93
- if rmbg_device.type != 'cpu':
94
- logger.info(f"Moving RMBG model from {rmbg_device} to CPU...")
95
- pipeline.rmbg_model.to('cpu')
96
-
97
- logger.info(f"⏰ Pipeline initialized and confirmed on CPU in {time.time() - start_time:.2f} seconds.")
98
-
99
  except Exception as e:
100
  logger.error(f"Failed to initialize pipeline: {e}", exc_info=True)
101
- pipeline = None # Ensure pipeline is None if initialization failed
102
- raise # Re-raise the exception to stop the application if critical
103
-
104
 
105
  initialize_pipeline()
106
 
107
- # --- Helper Functions ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  def start_session(req: gr.Request):
110
- # user_dir = os.path.join(TMP_DIR, str(req.session_hash)) # Per-session dir if needed
111
  torch.cuda.empty_cache()
112
- user_dir = TMP_DIR
113
- os.makedirs(user_dir, exist_ok=True)
114
 
115
  def end_session(req: gr.Request):
116
  torch.cuda.empty_cache()
117
 
 
118
  def preprocess_image(image: Optional[Image.Image]) -> Optional[Image.Image]:
119
  if image is None:
120
- logger.warning("Preprocess: received None image.")
121
  return None
122
- user_dir = TMP_DIR
123
- current_time = datetime.now().strftime("%Y-%m%d-%H%M%S")
124
- image_path = os.path.join(user_dir, f'{current_time}.png')
125
- image.save(image_path)
126
  try:
127
- processed_image = pipeline.preprocess_image(image)
128
- return processed_image
129
  except Exception as e:
130
  logger.error(f"Error during image preprocessing: {e}", exc_info=True)
131
- return None # Return None or raise error
132
-
133
- def preprocess_images(images: List[Tuple[Image.Image, str]]) -> List[Image.Image]:
134
- images = [image[0] for image in images]
135
- processed_images = pipeline.preprocess_images(images)
136
- return processed_images
137
 
138
  def get_seed(randomize_seed: bool, seed: int) -> int:
139
- if randomize_seed:
140
- new_seed = np.random.randint(0, MAX_SEED)
141
- return new_seed
142
- else:
143
- return seed
144
 
145
- # --- Core Logic Functions ---
146
 
147
- def generate_3d_data(
148
- image: Image.Image,
 
149
  seed: int,
 
150
  ss_guidance_strength: float,
151
  ss_sampling_steps: int,
152
- slat_guidance_strength: float,
153
- slat_sampling_steps: int,
 
 
 
 
 
154
  progress=gr.Progress(track_tqdm=True)
155
- ) -> Optional[dict]:
156
  if image is None or pipeline is None:
157
- logger.error("Generate 3D Data: called with None image or uninitialized pipeline.")
158
- return None
159
 
160
- pipeline_start = time.time()
 
 
 
161
 
162
- try:
163
- outputs = pipeline.run(
164
- image,
165
- seed=seed,
166
- formats=["gaussian", "mesh"],
167
- preprocess_image=False,
168
- sparse_structure_sampler_params={
169
- "steps": ss_sampling_steps,
170
- "cfg_strength": ss_guidance_strength,
171
- },
172
- slat_sampler_params={
173
- "steps": slat_sampling_steps,
174
- "cfg_strength": slat_guidance_strength,
175
- },
176
- )
177
- logger.info(f"⌚ Pipeline Time: {time.time() - pipeline_start:.2f} seconds")
178
- return outputs
179
- except Exception as e:
180
- logger.error(f"Error during pipeline run: {e}", exc_info=True)
181
- torch.cuda.empty_cache()
182
- return None
183
 
184
- def generate_3d_data_multi_image(
185
- images: List[Image.Image],
186
- seed: int,
187
- ss_guidance_strength: float,
188
- ss_sampling_steps: int,
189
- slat_guidance_strength: float,
190
- slat_sampling_steps: int,
191
- multiimage_algo: Literal["multidiffusion", "stochastic"],
192
- progress=gr.Progress(track_tqdm=True)
193
- ) -> Optional[dict]:
194
- if not images or pipeline is None:
195
- logger.error("Generate 3D Data Multi-Image: called with empty images or uninitialized pipeline.")
196
- return None
197
 
 
198
  pipeline_start = time.time()
199
 
200
  try:
201
- outputs = pipeline.run_multi_image(
202
- images,
203
  seed=seed,
204
- formats=["gaussian", "mesh"],
205
  preprocess_image=False,
206
  sparse_structure_sampler_params={
207
  "steps": ss_sampling_steps,
208
- "cfg_strength": ss_guidance_strength,
209
  },
210
- slat_sampler_params={
211
- "steps": slat_sampling_steps,
212
- "cfg_strength": slat_guidance_strength,
213
  },
214
- mode=multiimage_algo,
 
 
 
 
 
 
 
 
 
215
  )
216
- logger.info(f"⌚ Multi-Image Pipeline Time: {time.time() - pipeline_start:.2f} seconds")
217
- return outputs
218
  except Exception as e:
219
- logger.error(f"Error during multi-image pipeline run: {e}", exc_info=True)
220
  torch.cuda.empty_cache()
221
- return None
222
 
 
223
 
224
- def combine_and_save_video(
225
- video_color_frames: List[np.ndarray],
226
- video_normal_frames: List[np.ndarray],
227
- is_mobile: bool,
228
- user_dir: str,
229
- progress=gr.Progress(track_tqdm=True) # Keep progress for potential future use or consistency
230
- ) -> Optional[str]:
231
- """
232
- Combines pre-rendered color and normal video frames and saves the result.
233
- """
234
- if not video_color_frames or not video_normal_frames:
235
- logger.error("Combine Video: received empty frame lists.")
236
- return None
237
 
238
- combine_start = time.time()
239
- logger.info("Starting video frame combination...") # Added log
 
 
 
240
 
241
  try:
242
- # Combine frames
243
- if is_mobile:
244
- # Vertical video (stacked top and bottom)
245
- combined_video = [np.concatenate([color_frame, normal_frame], axis=0) for color_frame, normal_frame in zip(video_color_frames, video_normal_frames)]
246
- else:
247
- # Horizontal video (side by side)
248
- combined_video = [np.concatenate([color_frame, normal_frame], axis=1) for color_frame, normal_frame in zip(video_color_frames, video_normal_frames)]
249
-
250
- # Save video
251
- current_time = datetime.now().strftime("%Y-%m%d-%H%M%S")
252
- video_path = os.path.join(user_dir, f'{current_time}.mp4')
253
- # Use a thread for saving to avoid blocking if I/O is slow
254
- # Note: imageio.mimsave might release GIL for some codecs/operations
255
- imageio.mimsave(video_path, combined_video, fps=15)
256
-
257
- return video_path
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  except Exception as e:
260
- logger.error(f"Error during video combination/saving: {e}", exc_info=True)
261
- return None
262
-
263
-
264
- def extract_glb(
265
- outputs: dict,
266
- mesh_simplify: float,
267
- texture_size: int,
268
- progress=gr.Progress(track_tqdm=True)
269
- ) -> Optional[Tuple[str, str]]: # MODIFIED return type
270
- """
271
- Extract a GLB file from the 3D model outputs and convert to STL.
272
- (Modified to return GLB and STL paths)
273
- """
274
- if outputs is None or 'gaussian' not in outputs or 'mesh' not in outputs:
275
- logger.error("Extract GLB: received invalid outputs.")
276
- return None, None # MODIFIED return
277
-
278
- glb_start_time = time.time() # Renamed for clarity
279
- user_dir = TMP_DIR
280
- glb_path: Optional[str] = None
281
- stl_path: Optional[str] = None
282
 
 
283
  try:
284
- glb_model_data = postprocessing_utils.to_glb(
285
- outputs['gaussian'][0],
286
- outputs['mesh'][0],
287
- simplify=mesh_simplify,
288
- fill_holes=False,
 
 
 
 
 
 
 
 
289
  texture_size=texture_size,
290
- verbose=False,
291
- debug=False
 
 
292
  )
 
293
  current_time_glb = datetime.now().strftime("%Y-%m%d-%H%M%S")
294
- glb_path = os.path.join(user_dir, f'{current_time_glb}.glb')
295
- glb_model_data.export(glb_path)
296
- logger.info(f"GLB Export Time: {time.time() - glb_start_time:.2f} seconds. Saved to {glb_path}")
297
 
298
- stl_export_start_time = time.time()
299
  try:
300
  mesh_data = trimesh.load_mesh(glb_path, force='mesh')
301
  mesh_to_export = None
302
 
303
  if isinstance(mesh_data, trimesh.Scene):
304
- if mesh_data.geometry:
305
- geometries = [g for g in mesh_data.geometry.values() if isinstance(g, trimesh.Trimesh)]
306
- if geometries:
307
- valid_geometries = [g for g in geometries if g.vertices is not None and len(g.vertices) > 0]
308
- if valid_geometries:
309
- combined_mesh = trimesh.util.concatenate(valid_geometries)
310
- if isinstance(combined_mesh, trimesh.Trimesh) and combined_mesh.vertices is not None and len(combined_mesh.vertices) > 0:
311
- mesh_to_export = combined_mesh
312
- else:
313
- logger.warning(f"Concatenated mesh from scene in {glb_path} is not a valid Trimesh or is empty.")
314
- else:
315
- logger.warning(f"No valid (non-empty) trimesh.Trimesh geometry found in scene for {glb_path}")
316
- else:
317
- logger.warning(f"No trimesh.Trimesh geometry found in scene for {glb_path}")
318
- else:
319
- logger.warning(f"Scene for {glb_path} is empty.")
320
- elif isinstance(mesh_data, trimesh.Trimesh):
321
- if mesh_data.vertices is not None and len(mesh_data.vertices) > 0:
322
- mesh_to_export = mesh_data
323
- else:
324
- logger.warning(f"Loaded Trimesh from {glb_path} is empty.")
325
- else:
326
- logger.warning(f"Loaded object from {glb_path} of type {type(mesh_data)} is not a Trimesh or Scene.")
327
-
328
- if mesh_to_export:
329
- if not (mesh_to_export.faces is not None and len(mesh_to_export.faces) > 0):
330
- logger.warning(f"Mesh for STL export from {glb_path} has no faces. Attempting to convex hull.")
331
- if mesh_to_export.vertices is not None and len(mesh_to_export.vertices) >= 4:
332
- try:
333
- mesh_to_export = mesh_to_export.convex_hull
334
- except Exception as convex_e:
335
- logger.error(f"Failed to create convex hull for {glb_path}: {convex_e}")
336
- mesh_to_export = None
337
- else:
338
- mesh_to_export = None
339
-
340
- if mesh_to_export and mesh_to_export.faces is not None and len(mesh_to_export.faces) > 0:
341
- current_time_stl = datetime.now().strftime("%Y-%m%d-%H%M%S-%f")
342
- stl_path = os.path.join(user_dir, f'{current_time_stl}.stl')
343
- if mesh_to_export:
344
- mesh_to_export = mesh_to_export.copy()
345
- # Y-up to Z-up (upright, not upside down)
346
- rot_x_90 = trimesh.transformations.rotation_matrix(np.deg2rad(90), [1, 0, 0])
347
- mesh_to_export.apply_transform(rot_x_90)
348
- bbox = mesh_to_export.bounds
349
- current_size = (bbox[1] - bbox[0]).max()
350
- target_size_mm = 152.4 # 6 inches
351
- if current_size > 0:
352
- scale_factor = target_size_mm / current_size
353
- mesh_to_export.vertices *= scale_factor
354
- mesh_to_export.export(stl_path)
355
- logger.info(f"⌚ STL Export Time: {time.time() - stl_export_start_time:.2f} seconds. Saved to {stl_path}")
356
- elif mesh_to_export:
357
- logger.error(f"Failed to prepare mesh with faces from {glb_path} for STL export.")
358
- stl_path = None
359
- else:
360
- logger.error(f"No valid mesh could be processed from {glb_path} for STL export.")
361
- stl_path = None
362
-
363
  except Exception as stl_e:
364
- logger.error(f"Error during STL processing for {glb_path}: {stl_e}", exc_info=True)
365
- stl_path = None
366
-
367
- return glb_path, stl_path
368
-
369
- except Exception as e:
370
- logger.error(f"Error during GLB/STL extraction: {e}", exc_info=True)
371
- return None, None
372
-
373
-
374
- @spaces.GPU(duration=120)
375
- def process_image_concurrently_yielding(
376
- image: Optional[Image.Image],
377
- multiimages: List[Tuple[Image.Image, str]],
378
- is_multiimage: bool,
379
- seed: int,
380
- ss_guidance_strength: float,
381
- ss_sampling_steps: int,
382
- slat_guidance_strength: float,
383
- slat_sampling_steps: int,
384
- mesh_simplify: float,
385
- texture_size: int,
386
- multiimage_algo: Literal["multidiffusion", "stochastic"],
387
- req: gr.Request,
388
- progress=gr.Progress(track_tqdm=True)
389
- ) -> Generator[Tuple[Optional[str], Optional[str], Dict[str, Any]], None, None]:
390
- video_path: Optional[str] = None
391
- glb_path: Optional[str] = None
392
- stl_path: Optional[str] = None
393
- color_frames: Optional[List[np.ndarray]] = None
394
- normal_frames: Optional[List[np.ndarray]] = None
395
-
396
- # Move all models to real GPU now that @spaces.GPU has allocated one
397
- logger.info("Moving pipeline models to GPU...")
398
- move_start = time.time()
399
- pipeline.cuda()
400
- if hasattr(pipeline, 'rmbg_model') and pipeline.rmbg_model is not None:
401
- pipeline.rmbg_model.to('cuda')
402
- logger.info(f"Models moved to GPU in {time.time() - move_start:.2f}s")
403
 
404
- yield None, None, gr.update(value=None, interactive=False)
 
405
 
406
- # --- Step 0: Preprocess image(s) inside the GPU session ---
407
- if not is_multiimage:
408
- if image is None:
409
- logger.error("Received None image.")
410
- return
411
- logger.info("Preprocessing single image inside GPU session...")
412
- progress(0, desc="Removing background...")
413
- image = preprocess_image(image)
414
- if image is None:
415
- logger.error("Image preprocessing failed.")
416
- return
417
- logger.info("Starting Pipeline (Single Image)...")
418
- outputs = generate_3d_data(
419
- image, seed, ss_guidance_strength, ss_sampling_steps,
420
- slat_guidance_strength, slat_sampling_steps, progress=progress
421
- )
422
- else:
423
- if not multiimages:
424
- logger.error("Received empty multiimages list.")
425
- return
426
- logger.info(f"Preprocessing and starting Pipeline (Multi-Image: {len(multiimages)} images)...")
427
- progress(0, desc="Preprocessing images...")
428
- processed_images = preprocess_images(multiimages)
429
- progress(0.1, desc="Generating 3D Structure...")
430
- outputs = generate_3d_data_multi_image(
431
- processed_images, seed, ss_guidance_strength, ss_sampling_steps,
432
- slat_guidance_strength, slat_sampling_steps, multiimage_algo, progress=progress
433
- )
434
-
435
- if outputs is None:
436
- logger.error("Failed to generate 3D data. Aborting.")
437
- return
438
 
439
- # --- Step 2: Determine Render Settings ---
440
- user_agent = req.headers.get("User-Agent", "").lower()
441
- is_mobile = any(device in user_agent for device in ["android", "iphone", "ipad", "mobile"])
442
- resolution = 256 if is_mobile else 384
443
- num_frames = 45 # Consistent frame count
444
-
445
- # --- Step 3: Render Videos First ---
446
- vid_time = time.time()
447
-
448
- progress(0.4, desc="Rendering Preview Videos...")
449
- logger.info("Rendering videos: color and normal")
450
-
451
- try:
452
- color_result = render_utils.render_video(outputs['gaussian'][0], resolution=resolution, num_frames=num_frames, mode='color', verbose=False)
453
- normal_result = render_utils.render_video(outputs['mesh'][0], resolution=resolution, num_frames=num_frames, mode='normal', verbose=False)
454
-
455
- if color_result and 'color' in color_result:
456
- color_frames = color_result['color']
457
- else:
458
- logger.warning("Color video rendering returned invalid data.")
459
- color_frames = []
460
-
461
- if normal_result and 'normal' in normal_result:
462
- normal_frames = normal_result['normal']
463
- else:
464
- logger.warning("Normal video rendering returned invalid data.")
465
- normal_frames = []
466
-
467
- except Exception as exc:
468
- logger.error(f"Video rendering generated an exception: {exc}", exc_info=True)
469
- color_frames = []
470
- normal_frames = []
471
-
472
- if color_frames and normal_frames:
473
- video_path = combine_and_save_video(color_frames, normal_frames, is_mobile, TMP_DIR, progress=progress)
474
- if video_path:
475
- logger.info(f"✅ Video Time: {time.time() - vid_time:.2f} seconds")
476
- yield video_path, None, gr.update(value=None, interactive=False)
477
- else:
478
- logger.warning("Video combination/saving failed.")
479
-
480
- # --- Step 4: Extract GLB/STL After Video ---
481
- try:
482
- progress(0.7, desc="Finalizing 3D Model & Textures...")
483
- glb_stl_result = extract_glb(outputs, mesh_simplify, texture_size, progress=progress)
484
- if glb_stl_result and isinstance(glb_stl_result, tuple) and len(glb_stl_result) == 2:
485
- glb_path, stl_path = glb_stl_result
486
- if not glb_path:
487
- logger.warning("GLB extraction returned None.")
488
- if not stl_path:
489
- logger.warning("STL extraction returned None.")
490
- else:
491
- logger.warning(f"GLB/STL extraction returned invalid data: {glb_stl_result}")
492
- glb_path, stl_path = None, None
493
- except Exception as exc:
494
- logger.error(f"GLB/STL extraction generated an exception: {exc}", exc_info=True)
495
- glb_path, stl_path = None, None
496
-
497
- stl_button_update = gr.update(value=stl_path, interactive=True) if stl_path else gr.update(value=None, interactive=False)
498
- yield video_path, glb_path, stl_button_update
499
-
500
  torch.cuda.empty_cache()
501
 
502
 
@@ -504,57 +295,50 @@ css = """
504
  h1, h2, h3 { text-align: center; display: block; }
505
  footer { visibility: hidden; }
506
  .gradio-container { max-width: 1024px !important; }
507
- /* Base styles */
508
  .gr-image-container { display: flex !important; justify-content: center !important; align-items: center !important; width: 100%; height: 240px; }
509
  .gr-image-container img { width: 100%; height: 100%; object-fit: contain; object-position: center; }
510
- /* Desktop styles */
511
  @media screen and (min-width: 768px) {
512
  .gr-image-container { height: 360px !important; }
513
- .video-container { height: 360px !important; max-width: 680px !important; margin: 0 auto !important; aspect-ratio: auto !important; /* Adjusted aspect-ratio */ }
514
  .model-container { height: 480px !important; max-width: 680px !important; margin: 0 auto !important; }
515
  }
516
  .custom-header { display: flex; align-items: center; height: 100%; }
517
  """
518
 
519
  with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
 
 
520
  with gr.Row(equal_height=True):
521
  gr.Image("assets/sb_pocket_logo_dark.png", show_label=False, container=False, show_download_button=False, min_width=50, interactive=False, show_fullscreen_button=False)
522
 
523
  with gr.Column():
524
  with gr.Row():
525
  with gr.Column(scale=2, min_width=100, variant="default"):
526
- with gr.Tabs() as input_tabs:
527
- with gr.Tab(label="Single Image", id=0) as single_image_input_tab:
528
- image_prompt = gr.Image(label="Input",
529
- format="png",
530
- image_mode="RGBA",
531
- type="pil",
532
- sources=['upload', 'clipboard'],
533
- container=True,
534
- mirror_webcam=True,
535
- visible=True,
536
- height=240,
537
- elem_classes="gr-image-container",
538
- )
539
- with gr.Tab(label="Multiple Images", id=1) as multiimage_input_tab:
540
- multiimage_prompt = gr.Gallery(label="Images", format="png", type="pil", height=240, columns=3)
541
- gr.Markdown("""
542
- Input different views of the object in separate images.
543
-
544
- *NOTE: this is experimental and may not produce the best results for all images.*
545
- """)
546
- multi_image_process_Button = gr.Button(value="Process Images", visible=True, interactive=True, size="lg", variant="primary")
547
-
548
 
549
  with gr.Column(scale=5, min_width=100):
550
- video_output = gr.Video(label=" ",
551
- height=240, # Let CSS handle height
552
- elem_classes="video-container",
553
- visible=False,
554
- autoplay=True,
555
- loop=True,
556
- show_download_button=True,
557
- interactive=False)
 
 
 
558
  with gr.Row(equal_height=False):
559
  with gr.Column(scale=2, min_width=100, variant="default"):
560
  examples = gr.Examples(
@@ -562,50 +346,56 @@ with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
562
  f'./assets/example_image/{image}'
563
  for image in os.listdir("./assets/example_image")
564
  ],
565
- inputs=[image_prompt],
566
  examples_per_page=9,
567
  )
568
  with gr.Column(scale=5):
569
- model_output = LitModel3D(label="",
570
- container=True,
571
- zoom_speed=0.5,
572
- pan_speed=3.0,
573
- exposure=10.0, # Adjusted default exposure
574
- height=360, # Let CSS handle height
575
- elem_classes="model-container",
576
- visible=False)
 
 
577
  stl_download_button = gr.DownloadButton(label="Download STL", visible=False, interactive=False, size="lg", variant="primary")
578
 
579
  with gr.Accordion(label="Generation Settings", open=False, visible=not prod):
580
  seed_slider = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
581
  randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
 
582
  gr.Markdown("Stage 1: Sparse Structure Generation")
583
  with gr.Row():
584
- ss_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
585
  ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
586
- gr.Markdown("Stage 2: Structured Latent Generation")
587
  with gr.Row():
588
- slat_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=1.5, step=0.1)
589
- slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=6, step=1)
590
- multiimage_algo = gr.Radio(["stochastic", "multidiffusion"], label="Multi-image Algorithm", value="stochastic")
 
 
 
591
 
592
  with gr.Accordion(label="GLB Extraction Settings", open=False, visible=not prod):
593
- mesh_simplify = gr.Slider(0, 0.98, label="Simplify", value=0.95, step=0.01)
594
- texture_size = gr.Slider(512, 4096, label="Texture Size", value=1024, step=512)
595
-
596
- is_multiimage = gr.State(False)
597
 
598
  demo.load(start_session)
599
  demo.unload(end_session)
600
 
601
- single_image_input_tab.select(
602
- lambda: False,
603
- outputs=[is_multiimage]
604
- )
605
- multiimage_input_tab.select(
606
- lambda: True,
607
- outputs=[is_multiimage]
608
- )
 
 
609
 
610
  image_prompt.upload(
611
  get_seed,
@@ -614,74 +404,13 @@ with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
614
  show_progress="hidden",
615
  trigger_mode="always_last"
616
  ).then(
617
- fn=process_image_concurrently_yielding,
618
- inputs=[
619
- image_prompt,
620
- multiimage_prompt,
621
- is_multiimage,
622
- seed_slider,
623
- ss_guidance_strength, ss_sampling_steps,
624
- slat_guidance_strength, slat_sampling_steps,
625
- mesh_simplify, texture_size,
626
- multiimage_algo
627
- ],
628
- outputs=[video_output, model_output, stl_download_button],
629
  show_progress="hidden",
630
  scroll_to_output=True,
631
  )
632
 
633
- multi_image_process_Button.click(
634
- get_seed,
635
- inputs=[randomize_seed, seed_slider],
636
- outputs=[seed_slider],
637
- show_progress="hidden",
638
- trigger_mode="always_last"
639
- ).then(
640
- fn=process_image_concurrently_yielding,
641
- inputs=[
642
- image_prompt,
643
- multiimage_prompt,
644
- is_multiimage,
645
- seed_slider,
646
- ss_guidance_strength, ss_sampling_steps,
647
- slat_guidance_strength, slat_sampling_steps,
648
- mesh_simplify, texture_size,
649
- multiimage_algo
650
- ],
651
- outputs=[video_output, model_output, stl_download_button],
652
- show_progress="hidden",
653
- scroll_to_output=True,
654
- )
655
-
656
- # multiimage_prompt.upload(
657
- # preprocess_images,
658
- # inputs=[multiimage_prompt],
659
- # outputs=[multiimage_prompt],
660
- # show_progress="minimal",
661
- # ).then(
662
- # get_seed,
663
- # inputs=[randomize_seed, seed_slider],
664
- # outputs=[seed_slider],
665
- # show_progress="hidden",
666
- # trigger_mode="always_last"
667
- # ).then(
668
- # fn=process_image_concurrently_yielding,
669
- # inputs=[
670
- # image_prompt,
671
- # multiimage_prompt,
672
- # is_multiimage,
673
- # seed_slider,
674
- # ss_guidance_strength, ss_sampling_steps,
675
- # slat_guidance_strength, slat_sampling_steps,
676
- # mesh_simplify, texture_size,
677
- # multiimage_algo
678
- # ],
679
- # outputs=[video_output, model_output, stl_download_button],
680
- # show_progress="minimal",
681
- # scroll_to_output=True,
682
- # )
683
-
684
-
685
  examples.dataset.select(
686
  fn=get_seed,
687
  inputs=[randomize_seed, seed_slider],
@@ -689,51 +418,37 @@ with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
689
  show_progress="hidden",
690
  trigger_mode="always_last",
691
  ).then(
692
- fn=process_image_concurrently_yielding,
693
- inputs=[
694
- image_prompt,
695
- multiimage_prompt,
696
- is_multiimage,
697
- seed_slider,
698
- ss_guidance_strength, ss_sampling_steps,
699
- slat_guidance_strength, slat_sampling_steps,
700
- mesh_simplify, texture_size,
701
- multiimage_algo
702
- ],
703
- outputs=[video_output, model_output, stl_download_button],
704
  show_progress="hidden",
705
  scroll_to_output=True,
706
  )
707
 
708
-
709
- # --- UI Toggling based on Outputs ---
710
- @gr.on(triggers=[image_prompt.change], inputs=None, outputs=[video_output, model_output, stl_download_button], show_progress="minimal") # MODIFIED outputs
711
- def toggle_outputs_on_new_image(): # RENAMED and MODIFIED
712
  return (
713
  gr.update(visible=True, value=None),
714
- gr.update(visible=False, value=None),
715
  gr.update(visible=False, value=None, interactive=False)
716
  )
717
-
718
- @gr.on(triggers=[video_output.change], inputs=None, outputs=[model_output, stl_download_button]) # MODIFIED outputs
719
- def toggle_model_and_stl_button_visibility(): # RENAMED and MODIFIED
720
  return (
721
- gr.update(label="Interactive 3D Model", visible=True),
722
  gr.update(visible=True)
723
  )
724
-
725
  @gr.on(triggers=[video_output.change], inputs=None, outputs=video_output, show_progress="hidden")
726
  def toggle_label():
727
  return gr.update(label="Double Tap To Play", visible=True)
728
 
729
 
730
- print(f"[DIAG] reached launch section, __name__={__name__}", flush=True)
731
  if __name__ == "__main__":
732
- print("[DIAG] inside __main__ block", flush=True)
733
-
734
  if pipeline is None:
735
  logger.critical("Pipeline failed to initialize. Exiting.")
736
- sys.exit(1) # Exit if pipeline isn't ready
737
 
738
  running_on_spaces = bool(os.getenv("SPACE_ID"))
739
 
@@ -764,4 +479,4 @@ if __name__ == "__main__":
764
  debug=True,
765
  share=True,
766
  allowed_paths=["./cache", "./assets"]
767
- )
 
 
 
 
 
1
  import argparse
2
+ import os
3
+ import sys
4
  import time
5
+ import io
6
+ import base64
7
+
8
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = '1'
9
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
10
+ os.environ["ATTN_BACKEND"] = "flash_attn_3"
11
+ os.environ["FLEX_GEMM_AUTOTUNE_CACHE_PATH"] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'autotune_cache.json')
12
+ os.environ["FLEX_GEMM_AUTOTUNER_VERBOSE"] = '1'
13
+
14
  import gradio as gr
 
15
  import spaces
16
  from gradio_litmodel3d import LitModel3D
 
17
  sys.path.append(os.getcwd())
18
+ import cv2
 
 
 
19
  from typing import *
 
20
  import torch
 
21
  import numpy as np
22
  import imageio
23
  from PIL import Image
24
  import trimesh
 
25
  from datetime import datetime
26
  import logging
27
 
28
+ from trellis2.modules.sparse import SparseTensor
29
+ from trellis2.pipelines import Trellis2ImageTo3DPipeline
30
+ from trellis2.renderers import EnvMap
31
+ from trellis2.utils import render_utils
32
+ import o_voxel
33
+
34
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
35
  logger = logging.getLogger(__name__)
36
 
 
 
37
  parser = argparse.ArgumentParser(description="Pocket 3D AI 2")
38
+ parser.add_argument("--prod", action="store_true", help="Run in production mode")
39
+ parser.add_argument("--port", type=int, help="Port to run the server on (default: 8081 for prod, 8080 for dev)")
40
+ cmd_args, _unknown_args = parser.parse_known_args()
 
 
 
 
 
 
 
41
 
42
  prod = cmd_args.prod
43
  port = cmd_args.port if cmd_args.port else (8081 if prod else 8080)
44
  show_options = not prod
45
 
46
  MAX_SEED = np.iinfo(np.int32).max
 
47
  TMP_DIR = os.path.join('cache')
48
  os.makedirs(TMP_DIR, exist_ok=True)
49
 
 
50
  if gr.NO_RELOAD:
 
 
 
 
 
51
  pipeline = None
52
+ envmap = None
53
 
54
  def initialize_pipeline():
55
+ global pipeline, envmap
56
  if pipeline is not None:
 
57
  return
58
 
59
+ logger.info("Initializing TRELLIS.2 pipeline...")
60
  start_time = time.time()
 
 
 
61
 
62
  try:
63
+ pipeline = Trellis2ImageTo3DPipeline.from_pretrained('microsoft/TRELLIS.2-4B')
64
+ pipeline.rembg_model = None
65
+ pipeline.low_vram = False
66
+ pipeline._device = 'cpu'
67
+
68
+ envmap = {}
69
+ for name in ['forest', 'sunset', 'courtyard']:
70
+ exr_path = os.path.join('assets', 'hdri', f'{name}.exr')
71
+ if os.path.exists(exr_path):
72
+ envmap[name] = cv2.cvtColor(
73
+ cv2.imread(exr_path, cv2.IMREAD_UNCHANGED),
74
+ cv2.COLOR_BGR2RGB
75
+ )
76
+
77
+ logger.info(f"Pipeline initialized in {time.time() - start_time:.2f} seconds.")
 
 
 
 
 
 
78
  except Exception as e:
79
  logger.error(f"Failed to initialize pipeline: {e}", exc_info=True)
80
+ pipeline = None
81
+ raise
 
82
 
83
  initialize_pipeline()
84
 
85
+
86
+ def pack_state(latents: Tuple[SparseTensor, SparseTensor, int]) -> dict:
87
+ shape_slat, tex_slat, res = latents
88
+ return {
89
+ 'shape_slat_feats': shape_slat.feats.cpu().numpy(),
90
+ 'tex_slat_feats': tex_slat.feats.cpu().numpy(),
91
+ 'coords': shape_slat.coords.cpu().numpy(),
92
+ 'res': res,
93
+ }
94
+
95
+
96
+ def unpack_state(state: dict) -> Tuple[SparseTensor, SparseTensor, int]:
97
+ shape_slat = SparseTensor(
98
+ feats=torch.from_numpy(state['shape_slat_feats']).cuda(),
99
+ coords=torch.from_numpy(state['coords']).cuda(),
100
+ )
101
+ tex_slat = shape_slat.replace(torch.from_numpy(state['tex_slat_feats']).cuda())
102
+ return shape_slat, tex_slat, state['res']
103
+
104
 
105
  def start_session(req: gr.Request):
 
106
  torch.cuda.empty_cache()
107
+ os.makedirs(TMP_DIR, exist_ok=True)
108
+
109
 
110
  def end_session(req: gr.Request):
111
  torch.cuda.empty_cache()
112
 
113
+
114
  def preprocess_image(image: Optional[Image.Image]) -> Optional[Image.Image]:
115
  if image is None:
 
116
  return None
 
 
 
 
117
  try:
118
+ return pipeline.preprocess_image(image)
 
119
  except Exception as e:
120
  logger.error(f"Error during image preprocessing: {e}", exc_info=True)
121
+ return None
122
+
 
 
 
 
123
 
124
  def get_seed(randomize_seed: bool, seed: int) -> int:
125
+ return np.random.randint(0, MAX_SEED) if randomize_seed else seed
 
 
 
 
126
 
 
127
 
128
+ @spaces.GPU(duration=120)
129
+ def process_image_yielding(
130
+ image: Optional[Image.Image],
131
  seed: int,
132
+ resolution: str,
133
  ss_guidance_strength: float,
134
  ss_sampling_steps: int,
135
+ shape_guidance_strength: float,
136
+ shape_sampling_steps: int,
137
+ tex_guidance_strength: float,
138
+ tex_sampling_steps: int,
139
+ mesh_simplify: int,
140
+ texture_size: int,
141
+ req: gr.Request,
142
  progress=gr.Progress(track_tqdm=True)
143
+ ) -> Generator:
144
  if image is None or pipeline is None:
145
+ return
 
146
 
147
+ pipeline.cuda()
148
+ loaded_envmap = {}
149
+ for name, exr_data in envmap.items():
150
+ loaded_envmap[name] = EnvMap(torch.tensor(exr_data, dtype=torch.float32, device='cuda'))
151
 
152
+ yield None, None, gr.update(value=None, interactive=False), None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
+ progress(0, desc="Removing background...")
155
+ image = preprocess_image(image)
156
+ if image is None:
157
+ return
 
 
 
 
 
 
 
 
 
158
 
159
+ progress(0.1, desc="Generating 3D structure...")
160
  pipeline_start = time.time()
161
 
162
  try:
163
+ outputs, latents = pipeline.run(
164
+ image,
165
  seed=seed,
 
166
  preprocess_image=False,
167
  sparse_structure_sampler_params={
168
  "steps": ss_sampling_steps,
169
+ "guidance_strength": ss_guidance_strength,
170
  },
171
+ shape_slat_sampler_params={
172
+ "steps": shape_sampling_steps,
173
+ "guidance_strength": shape_guidance_strength,
174
  },
175
+ tex_slat_sampler_params={
176
+ "steps": tex_sampling_steps,
177
+ "guidance_strength": tex_guidance_strength,
178
+ },
179
+ pipeline_type={
180
+ "512": "512",
181
+ "1024": "1024_cascade",
182
+ "1536": "1536_cascade",
183
+ }[resolution],
184
+ return_latent=True,
185
  )
 
 
186
  except Exception as e:
187
+ logger.error(f"Error during pipeline run: {e}", exc_info=True)
188
  torch.cuda.empty_cache()
189
+ return
190
 
191
+ logger.info(f"Pipeline Time: {time.time() - pipeline_start:.2f} seconds")
192
 
193
+ mesh = outputs[0]
194
+ mesh.simplify(16777216)
195
+ state = pack_state(latents)
 
 
 
 
 
 
 
 
 
 
196
 
197
+ progress(0.5, desc="Rendering preview video...")
198
+ user_agent = req.headers.get("User-Agent", "").lower()
199
+ is_mobile = any(d in user_agent for d in ["android", "iphone", "ipad", "mobile"])
200
+ vid_resolution = 256 if is_mobile else 384
201
+ num_frames = 45
202
 
203
  try:
204
+ vid_result = render_utils.render_video(mesh, resolution=vid_resolution, num_frames=num_frames, r=2, fov=36, envmap=loaded_envmap)
205
+ color_frames = vid_result.get('shaded', vid_result.get('color', []))
206
+ normal_frames = vid_result.get('normal', [])
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
+ if color_frames and normal_frames:
209
+ if is_mobile:
210
+ combined = [np.concatenate([c, n], axis=0) for c, n in zip(color_frames, normal_frames)]
211
+ else:
212
+ combined = [np.concatenate([c, n], axis=1) for c, n in zip(color_frames, normal_frames)]
213
+
214
+ current_time = datetime.now().strftime("%Y-%m%d-%H%M%S")
215
+ video_path = os.path.join(TMP_DIR, f'{current_time}.mp4')
216
+ imageio.mimsave(video_path, combined, fps=15)
217
+ logger.info(f"Video rendered: {video_path}")
218
+ yield video_path, None, gr.update(value=None, interactive=False), state
219
+ elif color_frames:
220
+ current_time = datetime.now().strftime("%Y-%m%d-%H%M%S")
221
+ video_path = os.path.join(TMP_DIR, f'{current_time}.mp4')
222
+ imageio.mimsave(video_path, color_frames, fps=15)
223
+ yield video_path, None, gr.update(value=None, interactive=False), state
224
  except Exception as e:
225
+ logger.error(f"Video rendering error: {e}", exc_info=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
+ progress(0.7, desc="Extracting GLB model...")
228
  try:
229
+ shape_slat, tex_slat, res = unpack_state(state)
230
+ decoded_mesh = pipeline.decode_latent(shape_slat, tex_slat, res)[0]
231
+ decoded_mesh.simplify(16777216)
232
+
233
+ glb = o_voxel.postprocess.to_glb(
234
+ vertices=decoded_mesh.vertices,
235
+ faces=decoded_mesh.faces,
236
+ attr_volume=decoded_mesh.attrs,
237
+ coords=decoded_mesh.coords,
238
+ attr_layout=pipeline.pbr_attr_layout,
239
+ grid_size=res,
240
+ aabb=[[-0.5, -0.5, -0.5], [0.5, 0.5, 0.5]],
241
+ decimation_target=mesh_simplify,
242
  texture_size=texture_size,
243
+ remesh=True,
244
+ remesh_band=1,
245
+ remesh_project=0,
246
+ use_tqdm=True,
247
  )
248
+
249
  current_time_glb = datetime.now().strftime("%Y-%m%d-%H%M%S")
250
+ glb_path = os.path.join(TMP_DIR, f'{current_time_glb}.glb')
251
+ glb.export(glb_path, extension_webp=True)
252
+ logger.info(f"GLB exported: {glb_path}")
253
 
254
+ stl_path = None
255
  try:
256
  mesh_data = trimesh.load_mesh(glb_path, force='mesh')
257
  mesh_to_export = None
258
 
259
  if isinstance(mesh_data, trimesh.Scene):
260
+ geometries = [g for g in mesh_data.geometry.values() if isinstance(g, trimesh.Trimesh)]
261
+ valid = [g for g in geometries if g.vertices is not None and len(g.vertices) > 0]
262
+ if valid:
263
+ combined_mesh = trimesh.util.concatenate(valid)
264
+ if isinstance(combined_mesh, trimesh.Trimesh) and len(combined_mesh.vertices) > 0:
265
+ mesh_to_export = combined_mesh
266
+ elif isinstance(mesh_data, trimesh.Trimesh) and len(mesh_data.vertices) > 0:
267
+ mesh_to_export = mesh_data
268
+
269
+ if mesh_to_export and mesh_to_export.faces is not None and len(mesh_to_export.faces) > 0:
270
+ mesh_to_export = mesh_to_export.copy()
271
+ rot_x_90 = trimesh.transformations.rotation_matrix(np.deg2rad(90), [1, 0, 0])
272
+ mesh_to_export.apply_transform(rot_x_90)
273
+ bbox = mesh_to_export.bounds
274
+ current_size = (bbox[1] - bbox[0]).max()
275
+ target_size_mm = 152.4
276
+ if current_size > 0:
277
+ mesh_to_export.vertices *= target_size_mm / current_size
278
+ current_time_stl = datetime.now().strftime("%Y-%m%d-%H%M%S-%f")
279
+ stl_path = os.path.join(TMP_DIR, f'{current_time_stl}.stl')
280
+ mesh_to_export.export(stl_path)
281
+ logger.info(f"STL exported: {stl_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  except Exception as stl_e:
283
+ logger.error(f"STL export error: {stl_e}", exc_info=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
+ stl_update = gr.update(value=stl_path, interactive=True) if stl_path else gr.update(value=None, interactive=False)
286
+ yield video_path, glb_path, stl_update, state
287
 
288
+ except Exception as e:
289
+ logger.error(f"GLB extraction error: {e}", exc_info=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  torch.cuda.empty_cache()
292
 
293
 
 
295
  h1, h2, h3 { text-align: center; display: block; }
296
  footer { visibility: hidden; }
297
  .gradio-container { max-width: 1024px !important; }
 
298
  .gr-image-container { display: flex !important; justify-content: center !important; align-items: center !important; width: 100%; height: 240px; }
299
  .gr-image-container img { width: 100%; height: 100%; object-fit: contain; object-position: center; }
 
300
  @media screen and (min-width: 768px) {
301
  .gr-image-container { height: 360px !important; }
302
+ .video-container { height: 360px !important; max-width: 680px !important; margin: 0 auto !important; aspect-ratio: auto !important; }
303
  .model-container { height: 480px !important; max-width: 680px !important; margin: 0 auto !important; }
304
  }
305
  .custom-header { display: flex; align-items: center; height: 100%; }
306
  """
307
 
308
  with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
309
+ output_state = gr.State()
310
+
311
  with gr.Row(equal_height=True):
312
  gr.Image("assets/sb_pocket_logo_dark.png", show_label=False, container=False, show_download_button=False, min_width=50, interactive=False, show_fullscreen_button=False)
313
 
314
  with gr.Column():
315
  with gr.Row():
316
  with gr.Column(scale=2, min_width=100, variant="default"):
317
+ image_prompt = gr.Image(
318
+ label="Input",
319
+ format="png",
320
+ image_mode="RGBA",
321
+ type="pil",
322
+ sources=['upload', 'clipboard'],
323
+ container=True,
324
+ mirror_webcam=True,
325
+ visible=True,
326
+ height=240,
327
+ elem_classes="gr-image-container",
328
+ )
 
 
 
 
 
 
 
 
 
 
329
 
330
  with gr.Column(scale=5, min_width=100):
331
+ video_output = gr.Video(
332
+ label=" ",
333
+ height=240,
334
+ elem_classes="video-container",
335
+ visible=False,
336
+ autoplay=True,
337
+ loop=True,
338
+ show_download_button=True,
339
+ interactive=False,
340
+ )
341
+
342
  with gr.Row(equal_height=False):
343
  with gr.Column(scale=2, min_width=100, variant="default"):
344
  examples = gr.Examples(
 
346
  f'./assets/example_image/{image}'
347
  for image in os.listdir("./assets/example_image")
348
  ],
349
+ inputs=[image_prompt],
350
  examples_per_page=9,
351
  )
352
  with gr.Column(scale=5):
353
+ model_output = LitModel3D(
354
+ label="",
355
+ container=True,
356
+ zoom_speed=0.5,
357
+ pan_speed=3.0,
358
+ exposure=10.0,
359
+ height=360,
360
+ elem_classes="model-container",
361
+ visible=False,
362
+ )
363
  stl_download_button = gr.DownloadButton(label="Download STL", visible=False, interactive=False, size="lg", variant="primary")
364
 
365
  with gr.Accordion(label="Generation Settings", open=False, visible=not prod):
366
  seed_slider = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
367
  randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
368
+ resolution_radio = gr.Radio(["512", "1024", "1536"], label="Resolution", value="1024")
369
  gr.Markdown("Stage 1: Sparse Structure Generation")
370
  with gr.Row():
371
+ ss_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
372
  ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
373
+ gr.Markdown("Stage 2: Shape Generation")
374
  with gr.Row():
375
+ shape_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
376
+ shape_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
377
+ gr.Markdown("Stage 3: Texture Generation")
378
+ with gr.Row():
379
+ tex_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=1.0, step=0.1)
380
+ tex_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
381
 
382
  with gr.Accordion(label="GLB Extraction Settings", open=False, visible=not prod):
383
+ mesh_simplify = gr.Slider(100000, 500000, label="Decimation Target", value=300000, step=10000)
384
+ texture_size = gr.Slider(1024, 4096, label="Texture Size", value=2048, step=1024)
 
 
385
 
386
  demo.load(start_session)
387
  demo.unload(end_session)
388
 
389
+ generation_inputs = [
390
+ image_prompt,
391
+ seed_slider,
392
+ resolution_radio,
393
+ ss_guidance_strength, ss_sampling_steps,
394
+ shape_guidance_strength, shape_sampling_steps,
395
+ tex_guidance_strength, tex_sampling_steps,
396
+ mesh_simplify, texture_size,
397
+ ]
398
+ generation_outputs = [video_output, model_output, stl_download_button, output_state]
399
 
400
  image_prompt.upload(
401
  get_seed,
 
404
  show_progress="hidden",
405
  trigger_mode="always_last"
406
  ).then(
407
+ fn=process_image_yielding,
408
+ inputs=generation_inputs,
409
+ outputs=generation_outputs,
 
 
 
 
 
 
 
 
 
410
  show_progress="hidden",
411
  scroll_to_output=True,
412
  )
413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  examples.dataset.select(
415
  fn=get_seed,
416
  inputs=[randomize_seed, seed_slider],
 
418
  show_progress="hidden",
419
  trigger_mode="always_last",
420
  ).then(
421
+ fn=process_image_yielding,
422
+ inputs=generation_inputs,
423
+ outputs=generation_outputs,
 
 
 
 
 
 
 
 
 
424
  show_progress="hidden",
425
  scroll_to_output=True,
426
  )
427
 
428
+ @gr.on(triggers=[image_prompt.change], inputs=None, outputs=[video_output, model_output, stl_download_button], show_progress="minimal")
429
+ def toggle_outputs_on_new_image():
 
 
430
  return (
431
  gr.update(visible=True, value=None),
432
+ gr.update(visible=False, value=None),
433
  gr.update(visible=False, value=None, interactive=False)
434
  )
435
+
436
+ @gr.on(triggers=[video_output.change], inputs=None, outputs=[model_output, stl_download_button])
437
+ def toggle_model_and_stl_visibility():
438
  return (
439
+ gr.update(label="Interactive 3D Model", visible=True),
440
  gr.update(visible=True)
441
  )
442
+
443
  @gr.on(triggers=[video_output.change], inputs=None, outputs=video_output, show_progress="hidden")
444
  def toggle_label():
445
  return gr.update(label="Double Tap To Play", visible=True)
446
 
447
 
 
448
  if __name__ == "__main__":
 
 
449
  if pipeline is None:
450
  logger.critical("Pipeline failed to initialize. Exiting.")
451
+ sys.exit(1)
452
 
453
  running_on_spaces = bool(os.getenv("SPACE_ID"))
454
 
 
479
  debug=True,
480
  share=True,
481
  allowed_paths=["./cache", "./assets"]
482
+ )
wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl → assets/app/basecolor.png RENAMED
File without changes
assets/app/clay.png ADDED

Git LFS Details

  • SHA256: 26fa40f9d5820eabd33f9b08eb9188ce3d99be1b949a58d18aabdb5c4edcb9a0
  • Pointer size: 129 Bytes
  • Size of remote file: 4.46 kB
assets/app/hdri_city.png ADDED

Git LFS Details

  • SHA256: a1eb9ec4139459be1e3959cedda24af4a243e53aa6751b3bd89db69c39f66fa4
  • Pointer size: 129 Bytes
  • Size of remote file: 8.77 kB
assets/app/hdri_courtyard.png ADDED

Git LFS Details

  • SHA256: 65da1890b7c9bf39d6b18e3bc2ae3ae448f685a9881112b3d48bc4eb8c8cf330
  • Pointer size: 129 Bytes
  • Size of remote file: 9.38 kB
assets/app/hdri_forest.png ADDED

Git LFS Details

  • SHA256: d3fe4c0cbc6294838ec74af27f7f125e9506be5b4b705329cb3c5d8f51c8f0f9
  • Pointer size: 130 Bytes
  • Size of remote file: 10.6 kB
assets/app/hdri_interior.png ADDED

Git LFS Details

  • SHA256: 48ae15fb424c73aa2583c96b75cc987788ee78516d4fbf42b93a529f70e74c15
  • Pointer size: 129 Bytes
  • Size of remote file: 9.27 kB
assets/app/hdri_night.png ADDED

Git LFS Details

  • SHA256: ca39a5daa6c8fd22aafda006cd1929019bb1282758dd83e2ec71edc86bbd2287
  • Pointer size: 129 Bytes
  • Size of remote file: 6.93 kB
assets/app/hdri_studio.png ADDED

Git LFS Details

  • SHA256: d1474c10eb41a1a461cba9e946116ce41e8427fb4439696b3433bbe716f752af
  • Pointer size: 129 Bytes
  • Size of remote file: 6.38 kB
assets/app/hdri_sunrise.png ADDED

Git LFS Details

  • SHA256: b1a1081b25c1fe6f0db001ffd9ecd7c12ca041be5779bd90277619b1f8ee8d4f
  • Pointer size: 129 Bytes
  • Size of remote file: 8.4 kB
assets/app/hdri_sunset.png ADDED

Git LFS Details

  • SHA256: 8a2698e58e988b7dda4deaf958f31f2cb0dec4d6e208bbddb09d8e9e33ede734
  • Pointer size: 129 Bytes
  • Size of remote file: 8.1 kB
assets/app/normal.png ADDED

Git LFS Details

  • SHA256: 5bab5ef2defd9c513a9bf838cb04f793eaa916b95d9d07e380fefacd67c0aea0
  • Pointer size: 129 Bytes
  • Size of remote file: 3.86 kB
assets/hdri/city.exr ADDED

Git LFS Details

  • SHA256: 9e42abcd2fa3231e5c2485ca6dd64800534d157b7194ab7fe9cc3bf5a56d0256
  • Pointer size: 131 Bytes
  • Size of remote file: 205 kB
assets/hdri/courtyard.exr ADDED

Git LFS Details

  • SHA256: 6690b47725965531559380121e6878373eb599655e35fefd109a4bd0911366f3
  • Pointer size: 131 Bytes
  • Size of remote file: 255 kB
assets/hdri/forest.exr ADDED

Git LFS Details

  • SHA256: bdf2298244affa0f85509380fd130ac6d4dfaa3c856df065998f7f4c1a93dc0d
  • Pointer size: 131 Bytes
  • Size of remote file: 553 kB
assets/hdri/interior.exr ADDED

Git LFS Details

  • SHA256: e945ff5c1ddd7a3aaf05e9fb5c3bc9cb93c5518414febd44ed2c394e013f0cbd
  • Pointer size: 131 Bytes
  • Size of remote file: 189 kB
assets/hdri/license.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ All HDRIs are licensed as CC0.
2
+
3
+ These were created by Greg Zaal (Poly Haven https://polyhaven.com).
4
+ Originals used for each HDRI:
5
+ - City: https://polyhaven.com/a/portland_landing_pad
6
+ - Courtyard: https://polyhaven.com/a/courtyard
7
+ - Forest: https://polyhaven.com/a/ninomaru_teien
8
+ - Interior: https://polyhaven.com/a/hotel_room
9
+ - Night: Probably https://polyhaven.com/a/moonless_golf
10
+ - Studio: Probably https://polyhaven.com/a/studio_small_01
11
+ - Sunrise: https://polyhaven.com/a/spruit_sunrise
12
+ - Sunset: https://polyhaven.com/a/venice_sunset
13
+
14
+ 1K resolution of each was taken, and compressed with oiiotool:
15
+ oiiotool input.exr --ch R,G,B -d float --compression dwab:300 --clamp:min=0.0:max=32000.0 -o output.exr
assets/hdri/night.exr ADDED

Git LFS Details

  • SHA256: 17480af5547465d160f7307c92585cf30820ca563f87f066395decbad8ac32a4
  • Pointer size: 131 Bytes
  • Size of remote file: 140 kB
assets/hdri/studio.exr ADDED

Git LFS Details

  • SHA256: c36c7fd390b2cdf8693360cf3662cb1d710c95fa799d9f822f42997167839b56
  • Pointer size: 130 Bytes
  • Size of remote file: 98 kB
assets/hdri/sunrise.exr ADDED

Git LFS Details

  • SHA256: 6a1180126e4db7d01f134c5430ea43c1b263ab1a12faf58a444d9ce9c03f3a84
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
assets/hdri/sunset.exr ADDED

Git LFS Details

  • SHA256: 3bcafdda4f2d7b9759cc1d73004d34d721a274c29b1a0947be88a602dbac426b
  • Pointer size: 131 Bytes
  • Size of remote file: 171 kB
autotune_cache.json ADDED
The diff for this file is too large to render. See raw diff
 
extensions/nvdiffrast/LICENSE.txt DELETED
@@ -1,97 +0,0 @@
1
- Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
2
-
3
-
4
- Nvidia Source Code License (1-Way Commercial)
5
-
6
- =======================================================================
7
-
8
- 1. Definitions
9
-
10
- "Licensor" means any person or entity that distributes its Work.
11
-
12
- "Software" means the original work of authorship made available under
13
- this License.
14
-
15
- "Work" means the Software and any additions to or derivative works of
16
- the Software that are made available under this License.
17
-
18
- The terms "reproduce," "reproduction," "derivative works," and
19
- "distribution" have the meaning as provided under U.S. copyright law;
20
- provided, however, that for the purposes of this License, derivative
21
- works shall not include works that remain separable from, or merely
22
- link (or bind by name) to the interfaces of, the Work.
23
-
24
- Works, including the Software, are "made available" under this License
25
- by including in or with the Work either (a) a copyright notice
26
- referencing the applicability of this License to the Work, or (b) a
27
- copy of this License.
28
-
29
- 2. License Grants
30
-
31
- 2.1 Copyright Grant. Subject to the terms and conditions of this
32
- License, each Licensor grants to you a perpetual, worldwide,
33
- non-exclusive, royalty-free, copyright license to reproduce,
34
- prepare derivative works of, publicly display, publicly perform,
35
- sublicense and distribute its Work and any resulting derivative
36
- works in any form.
37
-
38
- 3. Limitations
39
-
40
- 3.1 Redistribution. You may reproduce or distribute the Work only
41
- if (a) you do so under this License, (b) you include a complete
42
- copy of this License with your distribution, and (c) you retain
43
- without modification any copyright, patent, trademark, or
44
- attribution notices that are present in the Work.
45
-
46
- 3.2 Derivative Works. You may specify that additional or different
47
- terms apply to the use, reproduction, and distribution of your
48
- derivative works of the Work ("Your Terms") only if (a) Your Terms
49
- provide that the use limitation in Section 3.3 applies to your
50
- derivative works, and (b) you identify the specific derivative
51
- works that are subject to Your Terms. Notwithstanding Your Terms,
52
- this License (including the redistribution requirements in Section
53
- 3.1) will continue to apply to the Work itself.
54
-
55
- 3.3 Use Limitation. The Work and any derivative works thereof only
56
- may be used or intended for use non-commercially. The Work or
57
- derivative works thereof may be used or intended for use by Nvidia
58
- or its affiliates commercially or non-commercially. As used herein,
59
- "non-commercially" means for research or evaluation purposes only
60
- and not for any direct or indirect monetary gain.
61
-
62
- 3.4 Patent Claims. If you bring or threaten to bring a patent claim
63
- against any Licensor (including any claim, cross-claim or
64
- counterclaim in a lawsuit) to enforce any patents that you allege
65
- are infringed by any Work, then your rights under this License from
66
- such Licensor (including the grant in Section 2.1) will terminate
67
- immediately.
68
-
69
- 3.5 Trademarks. This License does not grant any rights to use any
70
- Licensor's or its affiliates' names, logos, or trademarks, except
71
- as necessary to reproduce the notices described in this License.
72
-
73
- 3.6 Termination. If you violate any term of this License, then your
74
- rights under this License (including the grant in Section 2.1) will
75
- terminate immediately.
76
-
77
- 4. Disclaimer of Warranty.
78
-
79
- THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
80
- KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
81
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
82
- NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
83
- THIS LICENSE.
84
-
85
- 5. Limitation of Liability.
86
-
87
- EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
88
- THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
89
- SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
90
- INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
91
- OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
92
- (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
93
- LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
94
- COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
95
- THE POSSIBILITY OF SUCH DAMAGES.
96
-
97
- =======================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/README.md DELETED
@@ -1,42 +0,0 @@
1
- ## Nvdiffrast &ndash; Modular Primitives for High-Performance Differentiable Rendering
2
-
3
- ![Teaser image](./docs/img/teaser.png)
4
-
5
- **Modular Primitives for High-Performance Differentiable Rendering**<br>
6
- Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
7
- [http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
8
-
9
- Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
10
- Please refer to &#x261E;&#x261E; [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) &#x261C;&#x261C; for more information.
11
-
12
- ## Licenses
13
-
14
- Copyright &copy; 2020&ndash;2024, NVIDIA Corporation. All rights reserved.
15
-
16
- This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
17
-
18
- For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
19
-
20
- We do not currently accept outside code contributions in the form of pull requests.
21
-
22
- Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
23
- [sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
24
- originally shared under
25
- [MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
26
- Mesh and texture stored as part of `samples/data/earth.npz` are derived from
27
- [3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
28
- model originally made available under
29
- [TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
30
-
31
- ## Citation
32
-
33
- ```
34
- @article{Laine2020diffrast,
35
- title = {Modular Primitives for High-Performance Differentiable Rendering},
36
- author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
37
- journal = {ACM Transactions on Graphics},
38
- year = {2020},
39
- volume = {39},
40
- number = {6}
41
- }
42
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/__init__.py DELETED
@@ -1,9 +0,0 @@
1
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- #
3
- # NVIDIA CORPORATION and its licensors retain all intellectual property
4
- # and proprietary rights in and to this software, related documentation
5
- # and any modifications thereto. Any use, reproduction, disclosure or
6
- # distribution of this software and related documentation without an express
7
- # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- __version__ = '0.3.3'
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/antialias.cu DELETED
@@ -1,558 +0,0 @@
1
- // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #include "antialias.h"
10
-
11
- //------------------------------------------------------------------------
12
- // Helpers.
13
-
14
- #define F32_MAX (3.402823466e+38f)
15
- static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; }
16
- static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); }
17
- static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2)
18
- {
19
- bool g10 = rational_gt(n1, n0, d1, d0);
20
- bool g20 = rational_gt(n2, n0, d2, d0);
21
- bool g21 = rational_gt(n2, n1, d2, d1);
22
- if (g20 && g21) return 2;
23
- if (g10) return 1;
24
- return 0;
25
- }
26
-
27
- //------------------------------------------------------------------------
28
- // Format of antialiasing work items stored in work buffer. Usually accessed directly as int4.
29
-
30
- struct AAWorkItem
31
- {
32
- enum
33
- {
34
- EDGE_MASK = 3, // Edge index in lowest bits.
35
- FLAG_DOWN_BIT = 2, // Down instead of right.
36
- FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle.
37
- };
38
-
39
- int px, py; // Pixel x, y.
40
- unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags.
41
- float alpha; // Antialiasing alpha value. Zero if no AA.
42
- };
43
-
44
- //------------------------------------------------------------------------
45
- // Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html
46
-
47
- #define JENKINS_MAGIC (0x9e3779b9u)
48
- static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c)
49
- {
50
- a -= b; a -= c; a ^= (c>>13);
51
- b -= c; b -= a; b ^= (a<<8);
52
- c -= a; c -= b; c ^= (b>>13);
53
- a -= b; a -= c; a ^= (c>>12);
54
- b -= c; b -= a; b ^= (a<<16);
55
- c -= a; c -= b; c ^= (b>>5);
56
- a -= b; a -= c; a ^= (c>>3);
57
- b -= c; b -= a; b ^= (a<<10);
58
- c -= a; c -= b; c ^= (b>>15);
59
- }
60
-
61
- // Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip.
62
- class HashIndex
63
- {
64
- public:
65
- __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key)
66
- {
67
- m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824.
68
- m_idx = (uint32_t)(key & 0xffffffffu);
69
- m_skip = (uint32_t)(key >> 32);
70
- uint32_t dummy = JENKINS_MAGIC;
71
- jenkins_mix(m_idx, m_skip, dummy);
72
- m_idx &= m_mask;
73
- m_skip &= m_mask;
74
- m_skip |= 1;
75
- }
76
- __device__ __forceinline__ int get(void) const { return m_idx; }
77
- __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; }
78
- private:
79
- uint32_t m_idx, m_skip, m_mask;
80
- };
81
-
82
- static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v)
83
- {
84
- HashIndex idx(p, key);
85
- while(1)
86
- {
87
- uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key);
88
- if (prev == 0 || prev == key)
89
- break;
90
- idx.next();
91
- }
92
- int* q = (int*)&p.evHash[idx.get()];
93
- int a = atomicCAS(q+2, 0, v);
94
- if (a != 0 && a != v)
95
- atomicCAS(q+3, 0, v);
96
- }
97
-
98
- static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key)
99
- {
100
- HashIndex idx(p, key);
101
- while(1)
102
- {
103
- uint4 entry = p.evHash[idx.get()];
104
- uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32);
105
- if (k == key || k == 0)
106
- return make_int2((int)entry.z, (int)entry.w);
107
- idx.next();
108
- }
109
- }
110
-
111
- static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn)
112
- {
113
- if (va == vb)
114
- return;
115
-
116
- uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
117
- uint64_t v1 = (uint32_t)max(va, vb) + 1;
118
- uint64_t vk = v0 | (v1 << 32); // hash key
119
- hash_insert(p, vk, vn + 1);
120
- }
121
-
122
- static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr)
123
- {
124
- if (va == vb)
125
- return -1;
126
-
127
- uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
128
- uint64_t v1 = (uint32_t)max(va, vb) + 1;
129
- uint64_t vk = v0 | (v1 << 32); // hash key
130
- int2 vn = hash_find(p, vk) - 1;
131
- if (vn.x == vr) return vn.y;
132
- if (vn.y == vr) return vn.x;
133
- return -1;
134
- }
135
-
136
- //------------------------------------------------------------------------
137
- // Mesh analysis kernel.
138
-
139
- __global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p)
140
- {
141
- int idx = threadIdx.x + blockIdx.x * blockDim.x;
142
- if (idx >= p.numTriangles)
143
- return;
144
-
145
- int v0 = p.tri[idx * 3 + 0];
146
- int v1 = p.tri[idx * 3 + 1];
147
- int v2 = p.tri[idx * 3 + 2];
148
-
149
- if (v0 < 0 || v0 >= p.numVertices ||
150
- v1 < 0 || v1 >= p.numVertices ||
151
- v2 < 0 || v2 >= p.numVertices)
152
- return;
153
-
154
- if (v0 == v1 || v1 == v2 || v2 == v0)
155
- return;
156
-
157
- evhash_insert_vertex(p, v1, v2, v0);
158
- evhash_insert_vertex(p, v2, v0, v1);
159
- evhash_insert_vertex(p, v0, v1, v2);
160
- }
161
-
162
- //------------------------------------------------------------------------
163
- // Discontinuity finder kernel.
164
-
165
- __global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p)
166
- {
167
- // Calculate pixel position.
168
- int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
169
- int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
170
- int pz = blockIdx.z;
171
- if (px >= p.width || py >= p.height || pz >= p.n)
172
- return;
173
-
174
- // Pointer to our TriIdx and fetch.
175
- int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3;
176
- float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other.
177
-
178
- // Look right, clamp at edge.
179
- int pidx1 = pidx0;
180
- if (px < p.width - 1)
181
- pidx1 += 4;
182
- float tri1 = p.rasterOut[pidx1];
183
-
184
- // Look down, clamp at edge.
185
- int pidx2 = pidx0;
186
- if (py < p.height - 1)
187
- pidx2 += p.width << 2;
188
- float tri2 = p.rasterOut[pidx2];
189
-
190
- // Determine amount of work.
191
- int count = 0;
192
- if (tri1 != tri0) count = 1;
193
- if (tri2 != tri0) count += 1;
194
- if (!count)
195
- return; // Exit warp.
196
-
197
- // Coalesce work counter update to once per CTA.
198
- __shared__ int s_temp;
199
- s_temp = 0;
200
- __syncthreads();
201
- int idx = atomicAdd(&s_temp, count);
202
- __syncthreads();
203
- if (idx == 0)
204
- {
205
- int base = atomicAdd(&p.workBuffer[0].x, s_temp);
206
- s_temp = base + 1; // don't clobber the counters in first slot.
207
- }
208
- __syncthreads();
209
- idx += s_temp;
210
-
211
- // Write to memory.
212
- if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0);
213
- if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0);
214
- }
215
-
216
- //------------------------------------------------------------------------
217
- // Forward analysis kernel.
218
-
219
- __global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p)
220
- {
221
- __shared__ int s_base;
222
- int workCount = p.workBuffer[0].x;
223
- for(;;)
224
- {
225
- // Persistent threads work fetcher.
226
- __syncthreads();
227
- if (threadIdx.x == 0)
228
- s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
229
- __syncthreads();
230
- int thread_idx = s_base + threadIdx.x;
231
- if (thread_idx >= workCount)
232
- return;
233
-
234
- int4* pItem = p.workBuffer + thread_idx + 1;
235
- int4 item = *pItem;
236
- int px = item.x;
237
- int py = item.y;
238
- int pz = (int)(((unsigned int)item.z) >> 16);
239
- int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
240
-
241
- int pixel0 = px + p.width * (py + p.height * pz);
242
- int pixel1 = pixel0 + (d ? p.width : 1);
243
- float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1];
244
- float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
245
- int tri0 = float_to_triidx(zt0.y) - 1;
246
- int tri1 = float_to_triidx(zt1.y) - 1;
247
-
248
- // Select triangle based on background / depth.
249
- int tri = (tri0 >= 0) ? tri0 : tri1;
250
- if (tri0 >= 0 && tri1 >= 0)
251
- tri = (zt0.x < zt1.x) ? tri0 : tri1;
252
- if (tri == tri1)
253
- {
254
- // Calculate with respect to neighbor pixel if chose that triangle.
255
- px += 1 - d;
256
- py += d;
257
- }
258
-
259
- // Bail out if triangle index is corrupt.
260
- if (tri < 0 || tri >= p.numTriangles)
261
- continue;
262
-
263
- // Fetch vertex indices.
264
- int vi0 = p.tri[tri * 3 + 0];
265
- int vi1 = p.tri[tri * 3 + 1];
266
- int vi2 = p.tri[tri * 3 + 2];
267
-
268
- // Bail out if vertex indices are corrupt.
269
- if (vi0 < 0 || vi0 >= p.numVertices ||
270
- vi1 < 0 || vi1 >= p.numVertices ||
271
- vi2 < 0 || vi2 >= p.numVertices)
272
- continue;
273
-
274
- // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists.
275
- int op0 = evhash_find_vertex(p, vi2, vi1, vi0);
276
- int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
277
- int op2 = evhash_find_vertex(p, vi1, vi0, vi2);
278
-
279
- // Instance mode: Adjust vertex indices based on minibatch index.
280
- if (p.instance_mode)
281
- {
282
- int vbase = pz * p.numVertices;
283
- vi0 += vbase;
284
- vi1 += vbase;
285
- vi2 += vbase;
286
- if (op0 >= 0) op0 += vbase;
287
- if (op1 >= 0) op1 += vbase;
288
- if (op2 >= 0) op2 += vbase;
289
- }
290
-
291
- // Fetch vertex positions.
292
- float4 p0 = ((float4*)p.pos)[vi0];
293
- float4 p1 = ((float4*)p.pos)[vi1];
294
- float4 p2 = ((float4*)p.pos)[vi2];
295
- float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0];
296
- float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
297
- float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];
298
-
299
- // Project vertices to pixel space.
300
- float w0 = 1.f / p0.w;
301
- float w1 = 1.f / p1.w;
302
- float w2 = 1.f / p2.w;
303
- float ow0 = 1.f / o0.w;
304
- float ow1 = 1.f / o1.w;
305
- float ow2 = 1.f / o2.w;
306
- float fx = (float)px + .5f - p.xh;
307
- float fy = (float)py + .5f - p.yh;
308
- float x0 = p0.x * w0 * p.xh - fx;
309
- float y0 = p0.y * w0 * p.yh - fy;
310
- float x1 = p1.x * w1 * p.xh - fx;
311
- float y1 = p1.y * w1 * p.yh - fy;
312
- float x2 = p2.x * w2 * p.xh - fx;
313
- float y2 = p2.y * w2 * p.yh - fy;
314
- float ox0 = o0.x * ow0 * p.xh - fx;
315
- float oy0 = o0.y * ow0 * p.yh - fy;
316
- float ox1 = o1.x * ow1 * p.xh - fx;
317
- float oy1 = o1.y * ow1 * p.yh - fy;
318
- float ox2 = o2.x * ow2 * p.xh - fx;
319
- float oy2 = o2.y * ow2 * p.yh - fy;
320
-
321
- // Signs to kill non-silhouette edges.
322
- float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself.
323
- float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings.
324
- float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
325
- float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);
326
-
327
- // If no matching signs anywhere, skip the rest.
328
- if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
329
- {
330
- // XY flip for horizontal edges.
331
- if (d)
332
- {
333
- swap(x0, y0);
334
- swap(x1, y1);
335
- swap(x2, y2);
336
- }
337
-
338
- float dx0 = x2 - x1;
339
- float dx1 = x0 - x2;
340
- float dx2 = x1 - x0;
341
- float dy0 = y2 - y1;
342
- float dy1 = y0 - y2;
343
- float dy2 = y1 - y0;
344
-
345
- // Check if an edge crosses between us and the neighbor pixel.
346
- float dc = -F32_MAX;
347
- float ds = (tri == tri0) ? 1.f : -1.f;
348
- float d0 = ds * (x1*dy0 - y1*dx0);
349
- float d1 = ds * (x2*dy1 - y2*dx1);
350
- float d2 = ds * (x0*dy2 - y0*dx2);
351
-
352
- if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f;
353
- if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f;
354
- if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f;
355
-
356
- int di = max_idx3(d0, d1, d2, dy0, dy1, dy2);
357
- if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
358
- if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
359
- if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
360
- float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.
361
-
362
- // Adjust output image if a suitable edge was found.
363
- if (dc > -eps && dc < 1.f + eps)
364
- {
365
- dc = fminf(fmaxf(dc, 0.f), 1.f);
366
- float alpha = ds * (.5f - dc);
367
- const float* pColor0 = p.color + pixel0 * p.channels;
368
- const float* pColor1 = p.color + pixel1 * p.channels;
369
- float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
370
- for (int i=0; i < p.channels; i++)
371
- atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));
372
-
373
- // Rewrite the work item's flags and alpha. Keep original px, py.
374
- unsigned int flags = pz << 16;
375
- flags |= di;
376
- flags |= d << AAWorkItem::FLAG_DOWN_BIT;
377
- flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT;
378
- ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha));
379
- }
380
- }
381
- }
382
- }
383
-
384
- //------------------------------------------------------------------------
385
- // Gradient kernel.
386
-
387
- __global__ void AntialiasGradKernel(const AntialiasKernelParams p)
388
- {
389
- // Temporary space for coalesced atomics.
390
- CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
391
- __shared__ int s_base; // Work counter communication across entire CTA.
392
-
393
- int workCount = p.workBuffer[0].x;
394
-
395
- for(;;)
396
- {
397
- // Persistent threads work fetcher.
398
- __syncthreads();
399
- if (threadIdx.x == 0)
400
- s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
401
- __syncthreads();
402
- int thread_idx = s_base + threadIdx.x;
403
- if (thread_idx >= workCount)
404
- return;
405
-
406
- // Read work item filled out by forward kernel.
407
- int4 item = p.workBuffer[thread_idx + 1];
408
- unsigned int amask = __ballot_sync(0xffffffffu, item.w);
409
- if (item.w == 0)
410
- continue; // No effect.
411
-
412
- // Unpack work item and replicate setup from forward analysis kernel.
413
- int px = item.x;
414
- int py = item.y;
415
- int pz = (int)(((unsigned int)item.z) >> 16);
416
- int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
417
- float alpha = __int_as_float(item.w);
418
- int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1;
419
- int di = item.z & AAWorkItem::EDGE_MASK;
420
- float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31));
421
- int pixel0 = px + p.width * (py + p.height * pz);
422
- int pixel1 = pixel0 + (d ? p.width : 1);
423
- int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1;
424
- if (tri1)
425
- {
426
- px += 1 - d;
427
- py += d;
428
- }
429
-
430
- // Bail out if triangle index is corrupt.
431
- bool triFail = (tri < 0 || tri >= p.numTriangles);
432
- amask = __ballot_sync(amask, !triFail);
433
- if (triFail)
434
- continue;
435
-
436
- // Outgoing color gradients.
437
- float* pGrad0 = p.gradColor + pixel0 * p.channels;
438
- float* pGrad1 = p.gradColor + pixel1 * p.channels;
439
-
440
- // Incoming color gradients.
441
- const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
442
-
443
- // Position gradient weight based on colors and incoming gradients.
444
- float dd = 0.f;
445
- const float* pColor0 = p.color + pixel0 * p.channels;
446
- const float* pColor1 = p.color + pixel1 * p.channels;
447
-
448
- // Loop over channels and accumulate.
449
- for (int i=0; i < p.channels; i++)
450
- {
451
- float dy = pDy[i];
452
- if (dy != 0.f)
453
- {
454
- // Update position gradient weight.
455
- dd += dy * (pColor1[i] - pColor0[i]);
456
-
457
- // Update color gradients. No coalescing because all have different targets.
458
- float v = alpha * dy;
459
- atomicAdd(&pGrad0[i], -v);
460
- atomicAdd(&pGrad1[i], v);
461
- }
462
- }
463
-
464
- // If position weight is zero, skip the rest.
465
- bool noGrad = (dd == 0.f);
466
- amask = __ballot_sync(amask, !noGrad);
467
- if (noGrad)
468
- continue;
469
-
470
- // Fetch vertex indices of the active edge and their positions.
471
- int i1 = (di < 2) ? (di + 1) : 0;
472
- int i2 = (i1 < 2) ? (i1 + 1) : 0;
473
- int vi1 = p.tri[3 * tri + i1];
474
- int vi2 = p.tri[3 * tri + i2];
475
-
476
- // Bail out if vertex indices are corrupt.
477
- bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
478
- amask = __ballot_sync(amask, !vtxFail);
479
- if (vtxFail)
480
- continue;
481
-
482
- // Instance mode: Adjust vertex indices based on minibatch index.
483
- if (p.instance_mode)
484
- {
485
- vi1 += pz * p.numVertices;
486
- vi2 += pz * p.numVertices;
487
- }
488
-
489
- // Fetch vertex positions.
490
- float4 p1 = ((float4*)p.pos)[vi1];
491
- float4 p2 = ((float4*)p.pos)[vi2];
492
-
493
- // Project vertices to pixel space.
494
- float pxh = p.xh;
495
- float pyh = p.yh;
496
- float fx = (float)px + .5f - pxh;
497
- float fy = (float)py + .5f - pyh;
498
-
499
- // XY flip for horizontal edges.
500
- if (d)
501
- {
502
- swap(p1.x, p1.y);
503
- swap(p2.x, p2.y);
504
- swap(pxh, pyh);
505
- swap(fx, fy);
506
- }
507
-
508
- // Gradient calculation setup.
509
- float w1 = 1.f / p1.w;
510
- float w2 = 1.f / p2.w;
511
- float x1 = p1.x * w1 * pxh - fx;
512
- float y1 = p1.y * w1 * pyh - fy;
513
- float x2 = p2.x * w2 * pxh - fx;
514
- float y2 = p2.y * w2 * pyh - fy;
515
- float dx = x2 - x1;
516
- float dy = y2 - y1;
517
- float db = x1*dy - y1*dx;
518
-
519
- // Compute inverse delta-y with epsilon.
520
- float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
521
- float iy = 1.f / (dy + ep);
522
-
523
- // Compute position gradients.
524
- float dby = db * iy;
525
- float iw1 = -w1 * iy * dd;
526
- float iw2 = w2 * iy * dd;
527
- float gp1x = iw1 * pxh * y2;
528
- float gp2x = iw2 * pxh * y1;
529
- float gp1y = iw1 * pyh * (dby - x2);
530
- float gp2y = iw2 * pyh * (dby - x1);
531
- float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1;
532
- float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;
533
-
534
- // XY flip the gradients.
535
- if (d)
536
- {
537
- swap(gp1x, gp1y);
538
- swap(gp2x, gp2y);
539
- }
540
-
541
- // Kill position gradients if alpha was saturated.
542
- if (fabsf(alpha) >= 0.5f)
543
- {
544
- gp1x = gp1y = gp1w = 0.f;
545
- gp2x = gp2y = gp2w = 0.f;
546
- }
547
-
548
- // Initialize coalesced atomics. Match both triangle ID and edge index.
549
- // Also note that some threads may be inactive.
550
- CA_SET_GROUP_MASK(tri ^ (di << 30), amask);
551
-
552
- // Accumulate gradients.
553
- caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
554
- caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
555
- }
556
- }
557
-
558
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/antialias.h DELETED
@@ -1,50 +0,0 @@
1
- // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
- #include "common.h"
11
-
12
- //------------------------------------------------------------------------
13
- // Constants and helpers.
14
-
15
- #define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32
16
- #define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8
17
- #define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256
18
- #define AA_MESH_KERNEL_THREADS_PER_BLOCK 256
19
- #define AA_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 4 : 8) // With more than 16777216 triangles (alloc >= 33554432) use smallest possible value of 4 to conserve memory, otherwise use 8 for fewer collisions.
20
- #define AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 2 : 3)
21
- #define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256
22
-
23
- //------------------------------------------------------------------------
24
- // CUDA kernel params.
25
-
26
- struct AntialiasKernelParams
27
- {
28
- const float* color; // Incoming color buffer.
29
- const float* rasterOut; // Incoming rasterizer output buffer.
30
- const int* tri; // Incoming triangle buffer.
31
- const float* pos; // Incoming position buffer.
32
- float* output; // Output buffer of forward kernel.
33
- const float* dy; // Incoming gradients.
34
- float* gradColor; // Output buffer, color gradient.
35
- float* gradPos; // Output buffer, position gradient.
36
- int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters.
37
- uint4* evHash; // Edge-vertex hash.
38
- int allocTriangles; // Number of triangles accommodated by evHash. Always power of two.
39
- int numTriangles; // Number of triangles.
40
- int numVertices; // Number of vertices.
41
- int width; // Input width.
42
- int height; // Input height.
43
- int n; // Minibatch size.
44
- int channels; // Channel count in color input.
45
- float xh, yh; // Transfer to pixel space.
46
- int instance_mode; // 0=normal, 1=instance mode.
47
- int tri_const; // 1 if triangle array is known to be constant.
48
- };
49
-
50
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/common.cpp DELETED
@@ -1,60 +0,0 @@
1
- // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #include <cuda_runtime.h>
10
-
11
- //------------------------------------------------------------------------
12
- // Block and grid size calculators for kernel launches.
13
-
14
- dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height)
15
- {
16
- int maxThreads = maxWidth * maxHeight;
17
- if (maxThreads <= 1 || (width * height) <= 1)
18
- return dim3(1, 1, 1); // Degenerate.
19
-
20
- // Start from max size.
21
- int bw = maxWidth;
22
- int bh = maxHeight;
23
-
24
- // Optimizations for weirdly sized buffers.
25
- if (width < bw)
26
- {
27
- // Decrease block width to smallest power of two that covers the buffer width.
28
- while ((bw >> 1) >= width)
29
- bw >>= 1;
30
-
31
- // Maximize height.
32
- bh = maxThreads / bw;
33
- if (bh > height)
34
- bh = height;
35
- }
36
- else if (height < bh)
37
- {
38
- // Halve height and double width until fits completely inside buffer vertically.
39
- while (bh > height)
40
- {
41
- bh >>= 1;
42
- if (bw < width)
43
- bw <<= 1;
44
- }
45
- }
46
-
47
- // Done.
48
- return dim3(bw, bh, 1);
49
- }
50
-
51
- dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth)
52
- {
53
- dim3 gridSize;
54
- gridSize.x = (width - 1) / blockSize.x + 1;
55
- gridSize.y = (height - 1) / blockSize.y + 1;
56
- gridSize.z = (depth - 1) / blockSize.z + 1;
57
- return gridSize;
58
- }
59
-
60
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/common.h DELETED
@@ -1,263 +0,0 @@
1
- // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
- #include <cuda.h>
11
- #include <stdint.h>
12
-
13
- //------------------------------------------------------------------------
14
- // C++ helper function prototypes.
15
-
16
- dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height);
17
- dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth);
18
-
19
- //------------------------------------------------------------------------
20
- // The rest is CUDA device code specific stuff.
21
-
22
- #ifdef __CUDACC__
23
-
24
- //------------------------------------------------------------------------
25
- // Helpers for CUDA vector types.
26
-
27
- static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; }
28
- static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; }
29
- static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; }
30
- static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; }
31
- static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; }
32
- static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; }
33
- static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); }
34
- static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); }
35
- static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); }
36
- static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); }
37
- static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); }
38
- static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); }
39
- static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); }
40
- static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); }
41
- static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); }
42
- static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); }
43
- static __device__ __forceinline__ float3& operator*= (float3& a, const float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
44
- static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
45
- static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
46
- static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; }
47
- static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; }
48
- static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; }
49
- static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
50
- static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
51
- static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
52
- static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); }
53
- static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); }
54
- static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); }
55
- static __device__ __forceinline__ float3 operator* (float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); }
56
- static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); }
57
- static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); }
58
- static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, -a.z); }
59
- static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
60
- static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
61
- static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
62
- static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
63
- static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
64
- static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
65
- static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
66
- static __device__ __forceinline__ float4 operator+ (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
67
- static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
68
- static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); }
69
- static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); }
70
- static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); }
71
- static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); }
72
- static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); }
73
- static __device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); }
74
- static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); }
75
- static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; }
76
- static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; }
77
- static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; }
78
- static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; }
79
- static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; }
80
- static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; }
81
- static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); }
82
- static __device__ __forceinline__ int2 operator+ (const int2& a, const int2& b) { return make_int2(a.x + b.x, a.y + b.y); }
83
- static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); }
84
- static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); }
85
- static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); }
86
- static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); }
87
- static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); }
88
- static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); }
89
- static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); }
90
- static __device__ __forceinline__ int2 operator- (const int2& a) { return make_int2(-a.x, -a.y); }
91
- static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
92
- static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
93
- static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
94
- static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
95
- static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; }
96
- static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
97
- static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
98
- static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
99
- static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
100
- static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); }
101
- static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); }
102
- static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); }
103
- static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); }
104
- static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); }
105
- static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); }
106
- static __device__ __forceinline__ int3 operator- (const int3& a) { return make_int3(-a.x, -a.y, -a.z); }
107
- static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
108
- static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
109
- static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
110
- static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
111
- static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
112
- static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
113
- static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
114
- static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
115
- static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
116
- static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); }
117
- static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); }
118
- static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); }
119
- static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); }
120
- static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); }
121
- static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return make_int4(a - b.x, a - b.y, a - b.z, a - b.w); }
122
- static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); }
123
- static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; }
124
- static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; }
125
- static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; }
126
- static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; }
127
- static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; }
128
- static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; }
129
- static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); }
130
- static __device__ __forceinline__ uint2 operator+ (const uint2& a, const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); }
131
- static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); }
132
- static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); }
133
- static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); }
134
- static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); }
135
- static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); }
136
- static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); }
137
- static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); }
138
- static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
139
- static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
140
- static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
141
- static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
142
- static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; }
143
- static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
144
- static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); }
145
- static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); }
146
- static __device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); }
147
- static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); }
148
- static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); }
149
- static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); }
150
- static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); }
151
- static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); }
152
- static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); }
153
- static __device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
154
- static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
155
- static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
156
- static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
157
- static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
158
- static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
159
- static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
160
- static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
161
- static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
162
- static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); }
163
- static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); }
164
- static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); }
165
- static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); }
166
- static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); }
167
- static __device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); }
168
-
169
- template<class T> static __device__ __forceinline__ T zero_value(void);
170
- template<> __device__ __forceinline__ float zero_value<float> (void) { return 0.f; }
171
- template<> __device__ __forceinline__ float2 zero_value<float2>(void) { return make_float2(0.f, 0.f); }
172
- template<> __device__ __forceinline__ float4 zero_value<float4>(void) { return make_float4(0.f, 0.f, 0.f, 0.f); }
173
- static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); }
174
- static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); }
175
- static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); }
176
- static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); }
177
- static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, a.y, a.z, b); }
178
- static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); }
179
- static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); }
180
- static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); }
181
- static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); }
182
-
183
- template<class T> static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; }
184
-
185
- //------------------------------------------------------------------------
186
- // Triangle ID <-> float32 conversion functions to support very large triangle IDs.
187
- //
188
- // Values up to and including 16777216 (also, negative values) are converted trivially and retain
189
- // compatibility with previous versions. Larger values are mapped to unique float32 that are not equal to
190
- // the ID. The largest value that converts to float32 and back without generating inf or nan is 889192447.
191
-
192
- static __device__ __forceinline__ int float_to_triidx(float x) { if (x <= 16777216.f) return (int)x; return __float_as_int(x) - 0x4a800000; }
193
- static __device__ __forceinline__ float triidx_to_float(int x) { if (x <= 0x01000000) return (float)x; return __int_as_float(0x4a800000 + x); }
194
-
195
- //------------------------------------------------------------------------
196
- // Coalesced atomics. These are all done via macros.
197
-
198
- #if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
199
-
200
- #define CA_TEMP _ca_temp
201
- #define CA_TEMP_PARAM float* CA_TEMP
202
- #define CA_DECLARE_TEMP(threads_per_block) \
203
- __shared__ float CA_TEMP[(threads_per_block)]
204
-
205
- #define CA_SET_GROUP_MASK(group, thread_mask) \
206
- bool _ca_leader; \
207
- float* _ca_ptr; \
208
- do { \
209
- int tidx = threadIdx.x + blockDim.x * threadIdx.y; \
210
- int lane = tidx & 31; \
211
- int warp = tidx >> 5; \
212
- int tmask = __match_any_sync((thread_mask), (group)); \
213
- int leader = __ffs(tmask) - 1; \
214
- _ca_leader = (leader == lane); \
215
- _ca_ptr = &_ca_temp[((warp << 5) + leader)]; \
216
- } while(0)
217
-
218
- #define CA_SET_GROUP(group) \
219
- CA_SET_GROUP_MASK((group), 0xffffffffu)
220
-
221
- #define caAtomicAdd(ptr, value) \
222
- do { \
223
- if (_ca_leader) \
224
- *_ca_ptr = 0.f; \
225
- atomicAdd(_ca_ptr, (value)); \
226
- if (_ca_leader) \
227
- atomicAdd((ptr), *_ca_ptr); \
228
- } while(0)
229
-
230
- #define caAtomicAdd3_xyw(ptr, x, y, w) \
231
- do { \
232
- caAtomicAdd((ptr), (x)); \
233
- caAtomicAdd((ptr)+1, (y)); \
234
- caAtomicAdd((ptr)+3, (w)); \
235
- } while(0)
236
-
237
- #define caAtomicAddTexture(ptr, level, idx, value) \
238
- do { \
239
- CA_SET_GROUP((idx) ^ ((level) << 27)); \
240
- caAtomicAdd((ptr)+(idx), (value)); \
241
- } while(0)
242
-
243
- //------------------------------------------------------------------------
244
- // Disable atomic coalescing for compute capability lower than 7.x
245
-
246
- #else // __CUDA_ARCH__ >= 700
247
- #define CA_TEMP _ca_temp
248
- #define CA_TEMP_PARAM float CA_TEMP
249
- #define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
250
- #define CA_SET_GROUP_MASK(group, thread_mask)
251
- #define CA_SET_GROUP(group)
252
- #define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
253
- #define caAtomicAdd3_xyw(ptr, x, y, w) \
254
- do { \
255
- atomicAdd((ptr), (x)); \
256
- atomicAdd((ptr)+1, (y)); \
257
- atomicAdd((ptr)+3, (w)); \
258
- } while(0)
259
- #define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
260
- #endif // __CUDA_ARCH__ >= 700
261
-
262
- //------------------------------------------------------------------------
263
- #endif // __CUDACC__
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp DELETED
@@ -1,63 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
-
11
- //------------------------------------------------------------------------
12
- // This is a slimmed-down and modernized version of the original
13
- // CudaRaster codebase that accompanied the HPG 2011 paper
14
- // "High-Performance Software Rasterization on GPUs" by Laine and Karras.
15
- // Modifications have been made to accommodate post-Volta execution model
16
- // with warp divergence. Support for shading, blending, quad rendering,
17
- // and supersampling have been removed as unnecessary for nvdiffrast.
18
- //------------------------------------------------------------------------
19
-
20
- namespace CR
21
- {
22
-
23
- class RasterImpl;
24
-
25
- //------------------------------------------------------------------------
26
- // Interface class to isolate user from implementation details.
27
- //------------------------------------------------------------------------
28
-
29
- class CudaRaster
30
- {
31
- public:
32
- enum
33
- {
34
- RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling.
35
- RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set.
36
- };
37
-
38
- public:
39
- CudaRaster (void);
40
- ~CudaRaster (void);
41
-
42
- void setBufferSize (int width, int height, int numImages); // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
43
- void setViewport (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup.
44
- void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
45
- void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
46
- void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
47
- void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
48
- bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
49
- void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
50
- void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
51
- void swapDepthAndPeel (void); // Swap depth and peeling buffers.
52
-
53
- private:
54
- CudaRaster (const CudaRaster&); // forbidden
55
- CudaRaster& operator= (const CudaRaster&); // forbidden
56
-
57
- private:
58
- RasterImpl* m_impl; // Opaque pointer to implementation.
59
- };
60
-
61
- //------------------------------------------------------------------------
62
- } // namespace CR
63
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl DELETED
@@ -1,423 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- //------------------------------------------------------------------------
10
-
11
- __device__ __inline__ void binRasterImpl(const CRParams p)
12
- {
13
- __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
14
- __shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
15
- __shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
16
- __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
17
- __shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
18
- __shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
19
- __shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
20
- __shared__ volatile U32 s_batchPos;
21
- __shared__ volatile U32 s_bufCount;
22
- __shared__ volatile U32 s_overTotal;
23
- __shared__ volatile U32 s_allocBase;
24
-
25
- const CRImageParams& ip = getImageParams(p, blockIdx.z);
26
- CRAtomics& atomics = p.atomics[blockIdx.z];
27
- const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
28
- const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
29
-
30
- S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
31
- S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
32
- S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
33
- S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
34
- S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
35
-
36
- if (atomics.numSubtris > p.maxSubtris)
37
- return;
38
-
39
- // per-thread state
40
- int thrInBlock = threadIdx.x + threadIdx.y * 32;
41
- int batchPos = 0;
42
-
43
- // first 16 elements of s_broadcast are always zero
44
- if (thrInBlock < 16)
45
- s_broadcast[thrInBlock] = 0;
46
-
47
- // initialize output linked lists and offsets
48
- if (thrInBlock < p.numBins)
49
- {
50
- binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
51
- s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
52
- s_outTotal[thrInBlock] = 0;
53
- }
54
-
55
- // repeat until done
56
- for(;;)
57
- {
58
- // get batch
59
- if (thrInBlock == 0)
60
- s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
61
- __syncthreads();
62
- batchPos = s_batchPos;
63
-
64
- // all batches done?
65
- if (batchPos >= ip.triCount)
66
- break;
67
-
68
- // per-thread state
69
- int bufIndex = 0;
70
- int bufCount = 0;
71
- int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
72
-
73
- // loop over batch as long as we have triangles in it
74
- do
75
- {
76
- // read more triangles
77
- while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
78
- {
79
- // get subtriangle count
80
-
81
- int triIdx = batchPos + thrInBlock;
82
- int num = 0;
83
- if (triIdx < batchEnd)
84
- num = triSubtris[triIdx];
85
-
86
- // cumulative sum of subtriangles within each warp
87
- U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
88
- if (__any_sync(~0u, num > 1))
89
- {
90
- myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
91
- myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
92
- }
93
- if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
94
- s_broadcast[threadIdx.y + 16] = myIdx + num;
95
- __syncthreads();
96
-
97
- // cumulative sum of per-warp subtriangle counts
98
- // Note: cannot have more than 32 warps or this needs to sync between each step.
99
- bool act = (thrInBlock < CR_BIN_WARPS);
100
- U32 actMask = __ballot_sync(~0u, act);
101
- if (threadIdx.y == 0 && act)
102
- {
103
- volatile U32* ptr = &s_broadcast[thrInBlock + 16];
104
- U32 val = *ptr;
105
- #if (CR_BIN_WARPS > 1)
106
- val += ptr[-1]; __syncwarp(actMask);
107
- *ptr = val; __syncwarp(actMask);
108
- #endif
109
- #if (CR_BIN_WARPS > 2)
110
- val += ptr[-2]; __syncwarp(actMask);
111
- *ptr = val; __syncwarp(actMask);
112
- #endif
113
- #if (CR_BIN_WARPS > 4)
114
- val += ptr[-4]; __syncwarp(actMask);
115
- *ptr = val; __syncwarp(actMask);
116
- #endif
117
- #if (CR_BIN_WARPS > 8)
118
- val += ptr[-8]; __syncwarp(actMask);
119
- *ptr = val; __syncwarp(actMask);
120
- #endif
121
- #if (CR_BIN_WARPS > 16)
122
- val += ptr[-16]; __syncwarp(actMask);
123
- *ptr = val; __syncwarp(actMask);
124
- #endif
125
-
126
- // initially assume that we consume everything
127
- // only last active thread does the writes
128
- if (threadIdx.x == CR_BIN_WARPS - 1)
129
- {
130
- s_batchPos = batchPos + CR_BIN_WARPS * 32;
131
- s_bufCount = bufCount + val;
132
- }
133
- }
134
- __syncthreads();
135
-
136
- // skip if no subtriangles
137
- if (num)
138
- {
139
- // calculate write position for first subtriangle
140
- U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
141
-
142
- // only write if entire triangle fits
143
- if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
144
- {
145
- pos += bufIndex; // adjust for current start position
146
- pos &= CR_ARRAY_SIZE(s_triBuf)-1;
147
- if (num == 1)
148
- s_triBuf[pos] = triIdx * 8 + 7; // single triangle
149
- else
150
- {
151
- for (int i=0; i < num; i++)
152
- {
153
- s_triBuf[pos] = triIdx * 8 + i;
154
- pos++;
155
- pos &= CR_ARRAY_SIZE(s_triBuf)-1;
156
- }
157
- }
158
- } else if (pos <= CR_ARRAY_SIZE(s_triBuf))
159
- {
160
- // this triangle is the first that failed, overwrite total count and triangle count
161
- s_batchPos = batchPos + thrInBlock;
162
- s_bufCount = pos;
163
- }
164
- }
165
-
166
- // update triangle counts
167
- __syncthreads();
168
- batchPos = s_batchPos;
169
- bufCount = s_bufCount;
170
- }
171
-
172
- // make every warp clear its output buffers
173
- for (int i=threadIdx.x; i < p.numBins; i += 32)
174
- s_outMask[threadIdx.y][i] = 0;
175
- __syncwarp();
176
-
177
- // choose our triangle
178
- uint4 triData = make_uint4(0, 0, 0, 0);
179
- if (thrInBlock < bufCount)
180
- {
181
- U32 triPos = bufIndex + thrInBlock;
182
- triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
183
-
184
- // find triangle
185
- int triIdx = s_triBuf[triPos];
186
- int dataIdx = triIdx >> 3;
187
- int subtriIdx = triIdx & 7;
188
- if (subtriIdx != 7)
189
- dataIdx = triHeader[dataIdx].misc + subtriIdx;
190
-
191
- // read triangle
192
-
193
- triData = *(((const uint4*)triHeader) + dataIdx);
194
- }
195
-
196
- // setup bounding box and edge functions, and rasterize
197
- S32 lox, loy, hix, hiy;
198
- bool hasTri = (thrInBlock < bufCount);
199
- U32 hasTriMask = __ballot_sync(~0u, hasTri);
200
- if (hasTri)
201
- {
202
- S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
203
- S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
204
- S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
205
- S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
206
- S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
207
- S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
208
- int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
209
- lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
210
- loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
211
- hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
212
- hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
213
-
214
- U32 bit = 1 << threadIdx.x;
215
- #if __CUDA_ARCH__ >= 700
216
- bool multi = (hix != lox || hiy != loy);
217
- if (!__any_sync(hasTriMask, multi))
218
- {
219
- int binIdx = lox + p.widthBins * loy;
220
- U32 mask = __match_any_sync(hasTriMask, binIdx);
221
- s_outMask[threadIdx.y][binIdx] = mask;
222
- __syncwarp(hasTriMask);
223
- } else
224
- #endif
225
- {
226
- bool complex = (hix > lox+1 || hiy > loy+1);
227
- if (!__any_sync(hasTriMask, complex))
228
- {
229
- int binIdx = lox + p.widthBins * loy;
230
- atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
231
- if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
232
- if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
233
- if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
234
- } else
235
- {
236
- S32 d12x = d02x - d01x, d12y = d02y - d01y;
237
- v0x -= lox << binLog, v0y -= loy << binLog;
238
-
239
- S32 t01 = v0x * d01y - v0y * d01x;
240
- S32 t02 = v0y * d02x - v0x * d02y;
241
- S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
242
- S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
243
- S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
244
- S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
245
-
246
- int width = hix - lox + 1;
247
- d01x += width * d01y;
248
- d02x += width * d02y;
249
- d12x += width * d12y;
250
-
251
- U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
252
- U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
253
- U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
254
- int stride = p.widthBins * 4;
255
- int ptrYInc = stride - width * 4;
256
-
257
- do
258
- {
259
- if (b01 >= 0 && b02 >= 0 && b12 >= 0)
260
- atomicOr((U32*)currPtr, bit);
261
- currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
262
- if (currPtr == skipPtr)
263
- currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
264
- }
265
- while (currPtr != endPtr);
266
- }
267
- }
268
- }
269
-
270
- // count per-bin contributions
271
- if (thrInBlock == 0)
272
- s_overTotal = 0; // overflow counter
273
-
274
- // ensure that out masks are done
275
- __syncthreads();
276
-
277
- int overIndex = -1;
278
- bool act = (thrInBlock < p.numBins);
279
- U32 actMask = __ballot_sync(~0u, act);
280
- if (act)
281
- {
282
- U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
283
- U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
284
- int total = 0;
285
- for (int i = 0; i < CR_BIN_WARPS; i++)
286
- {
287
- total += __popc(*(U32*)srcPtr);
288
- *(U32*)dstPtr = total;
289
- srcPtr += (CR_MAXBINS_SQR + 1) * 4;
290
- dstPtr += (CR_MAXBINS_SQR + 1) * 4;
291
- }
292
-
293
- // overflow => request a new segment
294
- int ofs = s_outOfs[thrInBlock];
295
- bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
296
- U32 ovrMask = __ballot_sync(actMask, ovr);
297
- if (ovr)
298
- {
299
- overIndex = __popc(ovrMask & getLaneMaskLt());
300
- if (overIndex == 0)
301
- s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
302
- __syncwarp(ovrMask);
303
- overIndex += s_broadcast[threadIdx.y + 16];
304
- s_overIndex[thrInBlock] = overIndex;
305
- }
306
- }
307
-
308
- // sync after overTotal is ready
309
- __syncthreads();
310
-
311
- // at least one segment overflowed => allocate segments
312
- U32 overTotal = s_overTotal;
313
- U32 allocBase = 0;
314
- if (overTotal > 0)
315
- {
316
- // allocate memory
317
- if (thrInBlock == 0)
318
- {
319
- U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
320
- s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
321
- }
322
- __syncthreads();
323
- allocBase = s_allocBase;
324
-
325
- // did my bin overflow?
326
- if (overIndex != -1)
327
- {
328
- // calculate new segment index
329
- int segIdx = allocBase + overIndex;
330
-
331
- // add to linked list
332
- if (s_outOfs[thrInBlock] < 0)
333
- binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
334
- else
335
- binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
336
-
337
- // defaults
338
- binSegNext [segIdx] = -1;
339
- binSegCount[segIdx] = CR_BIN_SEG_SIZE;
340
- }
341
- }
342
-
343
- // concurrent emission -- each warp handles its own triangle
344
- if (thrInBlock < bufCount)
345
- {
346
- int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
347
- int currBin = lox + loy * p.widthBins;
348
- int skipBin = (hix + 1) + loy * p.widthBins;
349
- int endBin = lox + (hiy + 1) * p.widthBins;
350
- int binYInc = p.widthBins - (hix - lox + 1);
351
-
352
- // loop over triangle's bins
353
- do
354
- {
355
- U32 outMask = s_outMask[threadIdx.y][currBin];
356
- if (outMask & (1<<threadIdx.x))
357
- {
358
- int idx = __popc(outMask & getLaneMaskLt());
359
- if (threadIdx.y > 0)
360
- idx += s_outCount[threadIdx.y-1][currBin];
361
-
362
- int base = s_outOfs[currBin];
363
- int free = (-base) & (CR_BIN_SEG_SIZE - 1);
364
- if (idx >= free)
365
- idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
366
- else
367
- idx += base;
368
-
369
- binSegData[idx] = s_triBuf[triPos];
370
- }
371
-
372
- currBin++;
373
- if (currBin == skipBin)
374
- currBin += binYInc, skipBin += p.widthBins;
375
- }
376
- while (currBin != endBin);
377
- }
378
-
379
- // wait all triangles to finish, then replace overflown segment offsets
380
- __syncthreads();
381
- if (thrInBlock < p.numBins)
382
- {
383
- U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
384
- U32 oldOfs = s_outOfs[thrInBlock];
385
- if (overIndex == -1)
386
- s_outOfs[thrInBlock] = oldOfs + total;
387
- else
388
- {
389
- int addr = oldOfs + total;
390
- addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
391
- addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
392
- s_outOfs[thrInBlock] = addr;
393
- }
394
- s_outTotal[thrInBlock] += total;
395
- }
396
-
397
- // these triangles are now done
398
- int count = ::min(bufCount, CR_BIN_WARPS * 32);
399
- bufCount -= count;
400
- bufIndex += count;
401
- bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
402
- }
403
- while (bufCount > 0 || batchPos < batchEnd);
404
-
405
- // flush all bins
406
- if (thrInBlock < p.numBins)
407
- {
408
- int ofs = s_outOfs[thrInBlock];
409
- if (ofs & (CR_BIN_SEG_SIZE-1))
410
- {
411
- int seg = ofs >> CR_BIN_SEG_LOG2;
412
- binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
413
- s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
414
- }
415
- }
416
- }
417
-
418
- // output totals
419
- if (thrInBlock < p.numBins)
420
- binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
421
- }
422
-
423
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp DELETED
@@ -1,94 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #include "../../framework.h"
10
- #include "Buffer.hpp"
11
-
12
- using namespace CR;
13
-
14
- //------------------------------------------------------------------------
15
- // GPU buffer.
16
- //------------------------------------------------------------------------
17
-
18
- Buffer::Buffer(void)
19
- : m_gpuPtr(NULL),
20
- m_bytes (0)
21
- {
22
- // empty
23
- }
24
-
25
- Buffer::~Buffer(void)
26
- {
27
- if (m_gpuPtr)
28
- cudaFree(m_gpuPtr); // Don't throw an exception.
29
- }
30
-
31
- void Buffer::reset(size_t bytes)
32
- {
33
- if (bytes == m_bytes)
34
- return;
35
-
36
- if (m_gpuPtr)
37
- {
38
- NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
39
- m_gpuPtr = NULL;
40
- }
41
-
42
- if (bytes > 0)
43
- NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
44
-
45
- m_bytes = bytes;
46
- }
47
-
48
- void Buffer::grow(size_t bytes)
49
- {
50
- if (bytes > m_bytes)
51
- reset(bytes);
52
- }
53
-
54
- //------------------------------------------------------------------------
55
- // Host buffer with page-locked memory.
56
- //------------------------------------------------------------------------
57
-
58
- HostBuffer::HostBuffer(void)
59
- : m_hostPtr(NULL),
60
- m_bytes (0)
61
- {
62
- // empty
63
- }
64
-
65
- HostBuffer::~HostBuffer(void)
66
- {
67
- if (m_hostPtr)
68
- cudaFreeHost(m_hostPtr); // Don't throw an exception.
69
- }
70
-
71
- void HostBuffer::reset(size_t bytes)
72
- {
73
- if (bytes == m_bytes)
74
- return;
75
-
76
- if (m_hostPtr)
77
- {
78
- NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
79
- m_hostPtr = NULL;
80
- }
81
-
82
- if (bytes > 0)
83
- NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
84
-
85
- m_bytes = bytes;
86
- }
87
-
88
- void HostBuffer::grow(size_t bytes)
89
- {
90
- if (bytes > m_bytes)
91
- reset(bytes);
92
- }
93
-
94
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp DELETED
@@ -1,55 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
- #include "Defs.hpp"
11
-
12
- namespace CR
13
- {
14
- //------------------------------------------------------------------------
15
-
16
- class Buffer
17
- {
18
- public:
19
- Buffer (void);
20
- ~Buffer (void);
21
-
22
- void reset (size_t bytes);
23
- void grow (size_t bytes);
24
- void* getPtr (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
25
- size_t getSize (void) const { return m_bytes; }
26
-
27
- void setPtr (void* ptr) { m_gpuPtr = ptr; }
28
-
29
- private:
30
- void* m_gpuPtr;
31
- size_t m_bytes;
32
- };
33
-
34
- //------------------------------------------------------------------------
35
-
36
- class HostBuffer
37
- {
38
- public:
39
- HostBuffer (void);
40
- ~HostBuffer (void);
41
-
42
- void reset (size_t bytes);
43
- void grow (size_t bytes);
44
- void* getPtr (void) { return m_hostPtr; }
45
- size_t getSize (void) const { return m_bytes; }
46
-
47
- void setPtr (void* ptr) { m_hostPtr = ptr; }
48
-
49
- private:
50
- void* m_hostPtr;
51
- size_t m_bytes;
52
- };
53
-
54
- //------------------------------------------------------------------------
55
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl DELETED
@@ -1,730 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- //------------------------------------------------------------------------
10
-
11
- __device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
12
- {
13
- int tileX = tileInBin & (CR_BIN_SIZE - 1);
14
- int tileY = tileInBin >> CR_BIN_LOG2;
15
- return tileX + tileY * widthTiles;
16
- }
17
-
18
- //------------------------------------------------------------------------
19
-
20
- __device__ __inline__ void coarseRasterImpl(const CRParams p)
21
- {
22
- // Common.
23
-
24
- __shared__ volatile U32 s_workCounter;
25
- __shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB
26
-
27
- // Input.
28
-
29
- __shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB
30
- __shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB
31
- __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB
32
- __shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB
33
- __shared__ volatile S32 s_triQueueWritePos;
34
- __shared__ volatile U32 s_binStreamSelectedOfs;
35
- __shared__ volatile U32 s_binStreamSelectedSize;
36
-
37
- // Output.
38
-
39
- __shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
40
- __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
41
- __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning
42
- __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning
43
- __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB
44
- __shared__ volatile U32 s_firstAllocSeg;
45
- __shared__ volatile U32 s_firstActiveIdx;
46
-
47
- // Pointers and constants.
48
-
49
- CRAtomics& atomics = p.atomics[blockIdx.z];
50
- const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
51
- const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
52
- const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
53
- const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
54
- const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
55
- const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
56
- S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
57
- S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
58
- S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
59
- S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
60
- S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
61
-
62
- int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
63
- int thrInBlock = threadIdx.x + threadIdx.y * 32;
64
- int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
65
-
66
- if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
67
- return;
68
-
69
- // Initialize sharedmem arrays.
70
-
71
- if (thrInBlock == 0)
72
- {
73
- s_tileEmitPrefixSum[0] = 0;
74
- s_tileAllocPrefixSum[0] = 0;
75
- }
76
- s_scanTemp[threadIdx.y][threadIdx.x] = 0;
77
-
78
- // Sort bins in descending order of triangle count.
79
-
80
- for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
81
- {
82
- int count = 0;
83
- for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
84
- count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
85
- s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
86
- }
87
-
88
- __syncthreads();
89
- sortShared(s_binOrder, p.numBins);
90
-
91
- // Process each bin by one block.
92
-
93
- for (;;)
94
- {
95
- // Pick a bin for the block.
96
-
97
- if (thrInBlock == 0)
98
- s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
99
- __syncthreads();
100
-
101
- int workCounter = s_workCounter;
102
- if (workCounter >= p.numBins)
103
- break;
104
-
105
- U32 binOrder = s_binOrder[workCounter];
106
- bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
107
- if (binEmpty && !p.deferredClear)
108
- break;
109
-
110
- int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
111
-
112
- // Initialize input/output streams.
113
-
114
- int triQueueWritePos = 0;
115
- int triQueueReadPos = 0;
116
-
117
- if (thrInBlock < CR_BIN_STREAMS_SIZE)
118
- {
119
- int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
120
- s_binStreamCurrSeg[thrInBlock] = segIdx;
121
- s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
122
- }
123
-
124
- for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
125
- s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
126
-
127
- // Initialize per-bin state.
128
-
129
- int binY = idiv_fast(binIdx, p.widthBins);
130
- int binX = binIdx - binY * p.widthBins;
131
- int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
132
- int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
133
- int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
134
- int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
135
- int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
136
-
137
- // Entire block: Merge input streams and process triangles.
138
-
139
- if (!binEmpty)
140
- do
141
- {
142
- //------------------------------------------------------------------------
143
- // Merge.
144
- //------------------------------------------------------------------------
145
-
146
- // Entire block: Not enough triangles => merge and queue segments.
147
- // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
148
-
149
- while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
150
- {
151
- // First warp: Choose the segment with the lowest initial triangle index.
152
-
153
- bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
154
- U32 hasStreamMask = __ballot_sync(~0u, hasStream);
155
- if (hasStream)
156
- {
157
- // Find the stream with the lowest triangle index.
158
-
159
- U32 firstTri = s_binStreamFirstTri[thrInBlock];
160
- U32 t = firstTri;
161
- volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
162
-
163
- #if (CR_BIN_STREAMS_SIZE > 1)
164
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
165
- #endif
166
- #if (CR_BIN_STREAMS_SIZE > 2)
167
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
168
- #endif
169
- #if (CR_BIN_STREAMS_SIZE > 4)
170
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
171
- #endif
172
- #if (CR_BIN_STREAMS_SIZE > 8)
173
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
174
- #endif
175
- #if (CR_BIN_STREAMS_SIZE > 16)
176
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
177
- #endif
178
- v[0] = t; __syncwarp(hasStreamMask);
179
-
180
- // Consume and broadcast.
181
-
182
- bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
183
- U32 firstMask = __ballot_sync(hasStreamMask, first);
184
- if (first && (firstMask >> threadIdx.x) == 1u)
185
- {
186
- int segIdx = s_binStreamCurrSeg[thrInBlock];
187
- s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
188
- if (segIdx != -1)
189
- {
190
- int segSize = binSegCount[segIdx];
191
- int segNext = binSegNext[segIdx];
192
- s_binStreamSelectedSize = segSize;
193
- s_triQueueWritePos = triQueueWritePos + segSize;
194
- s_binStreamCurrSeg[thrInBlock] = segNext;
195
- s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
196
- }
197
- }
198
- }
199
-
200
- // No more segments => break.
201
-
202
- __syncthreads();
203
- triQueueWritePos = s_triQueueWritePos;
204
- int segOfs = s_binStreamSelectedOfs;
205
- if (segOfs < 0)
206
- break;
207
-
208
- int segSize = s_binStreamSelectedSize;
209
- __syncthreads();
210
-
211
- // Fetch triangles into the queue.
212
-
213
- for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
214
- {
215
- S32 triIdx = binSegData[segOfs + idxInSeg];
216
- s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
217
- }
218
- }
219
-
220
- // All threads: Clear emit masks.
221
-
222
- for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
223
- s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
224
-
225
- __syncthreads();
226
-
227
- //------------------------------------------------------------------------
228
- // Raster.
229
- //------------------------------------------------------------------------
230
-
231
- // Triangle per thread: Read from the queue.
232
-
233
- int triIdx = -1;
234
- if (triQueueReadPos + thrInBlock < triQueueWritePos)
235
- triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
236
-
237
- uint4 triData = make_uint4(0, 0, 0, 0);
238
- if (triIdx != -1)
239
- {
240
- int dataIdx = triIdx >> 3;
241
- int subtriIdx = triIdx & 7;
242
- if (subtriIdx != 7)
243
- dataIdx = triHeader[dataIdx].misc + subtriIdx;
244
- triData = *((uint4*)triHeader + dataIdx);
245
- }
246
-
247
- // 32 triangles per warp: Record emits (= tile intersections).
248
-
249
- if (__any_sync(~0u, triIdx != -1))
250
- {
251
- S32 v0x = sub_s16lo_s16lo(triData.x, originX);
252
- S32 v0y = sub_s16hi_s16lo(triData.x, originY);
253
- S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
254
- S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
255
- S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
256
- S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
257
-
258
- // Compute tile-based AABB.
259
-
260
- int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
261
- int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
262
- int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
263
- int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
264
- int sizex = add_sub(hix, 1, lox);
265
- int sizey = add_sub(hiy, 1, loy);
266
- int area = sizex * sizey;
267
-
268
- // Miscellaneous init.
269
-
270
- U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
271
- int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
272
- U32 maskBit = 1 << threadIdx.x;
273
-
274
- // Case A: All AABBs are small => record the full AABB using atomics.
275
-
276
- if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
277
- {
278
- if (triIdx != -1)
279
- {
280
- atomicOr((U32*)currPtr, maskBit);
281
- if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
282
- if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
283
- if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
284
- }
285
- }
286
- else
287
- {
288
- // Compute warp-AABB (scan-32).
289
-
290
- U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
291
- if (triIdx == -1)
292
- aabbMask = 0;
293
-
294
- volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
295
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
296
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
297
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
298
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
299
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
300
- v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
301
-
302
- U32 maskX = aabbMask & 0xFFFF;
303
- U32 maskY = aabbMask >> 16;
304
- int wlox = findLeadingOne(maskX ^ (maskX - 1));
305
- int wloy = findLeadingOne(maskY ^ (maskY - 1));
306
- int whix = findLeadingOne(maskX);
307
- int whiy = findLeadingOne(maskY);
308
- int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
309
-
310
- // Initialize edge functions.
311
-
312
- S32 d12x = d02x - d01x;
313
- S32 d12y = d02y - d01y;
314
- v0x -= lox << tileLog;
315
- v0y -= loy << tileLog;
316
-
317
- S32 t01 = v0x * d01y - v0y * d01x;
318
- S32 t02 = v0y * d02x - v0x * d02y;
319
- S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
320
- S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
321
- S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
322
- S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
323
-
324
- d01x += sizex * d01y;
325
- d02x += sizex * d02y;
326
- d12x += sizex * d12y;
327
-
328
- // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
329
- if (__any_sync(~0u, warea * 4 <= area * 8))
330
- {
331
- // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
332
- bool act = (triIdx != -1);
333
- U32 actMask = __ballot_sync(~0u, act);
334
- if (act)
335
- {
336
- for (int y = wloy; y <= whiy; y++)
337
- {
338
- bool yIn = (y >= loy && y <= hiy);
339
- U32 yMask = __ballot_sync(actMask, yIn);
340
- if (yIn)
341
- {
342
- for (int x = wlox; x <= whix; x++)
343
- {
344
- bool xyIn = (x >= lox && x <= hix);
345
- U32 xyMask = __ballot_sync(yMask, xyIn);
346
- if (xyIn)
347
- {
348
- U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
349
- if (threadIdx.x == 31 - __clz(xyMask))
350
- *(U32*)currPtr = res;
351
- currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
352
- }
353
- }
354
- currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
355
- }
356
- }
357
- }
358
- }
359
-
360
- // Case C: General case => Check tiles in AABB, record using atomics.
361
-
362
- else
363
- {
364
- if (triIdx != -1)
365
- {
366
- U8* skipPtr = currPtr + (sizex << 2);
367
- U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
368
- do
369
- {
370
- if (b01 >= 0 && b02 >= 0 && b12 >= 0)
371
- atomicOr((U32*)currPtr, maskBit);
372
- currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
373
- if (currPtr == skipPtr)
374
- currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
375
- }
376
- while (currPtr != endPtr);
377
- }
378
- }
379
- }
380
- }
381
-
382
- __syncthreads();
383
-
384
- //------------------------------------------------------------------------
385
- // Count.
386
- //------------------------------------------------------------------------
387
-
388
- // Tile per thread: Initialize prefix sums.
389
-
390
- for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
391
- {
392
- int tileInBin = tileInBin_base + thrInBlock;
393
- bool act = (tileInBin < CR_BIN_SQR);
394
- U32 actMask = __ballot_sync(~0u, act);
395
- if (act)
396
- {
397
- // Compute prefix sum of emits over warps.
398
-
399
- U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
400
- U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
401
- int tileEmits = 0;
402
- for (int i = 0; i < CR_COARSE_WARPS; i++)
403
- {
404
- tileEmits += __popc(*(U32*)srcPtr);
405
- *(U32*)dstPtr = tileEmits;
406
- srcPtr += (CR_BIN_SQR + 1) * 4;
407
- dstPtr += (CR_BIN_SQR + 1) * 4;
408
- }
409
-
410
- // Determine the number of segments to allocate.
411
-
412
- int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
413
- int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
414
- volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
415
-
416
- // All counters within the warp are small => compute prefix sum using ballot.
417
-
418
- if (!__any_sync(actMask, tileEmits >= 2))
419
- {
420
- U32 m = getLaneMaskLe();
421
- *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
422
- }
423
-
424
- // Otherwise => scan-32 within the warp.
425
-
426
- else
427
- {
428
- U32 sum = (tileEmits << emitShift) | tileAllocs;
429
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
430
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
431
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
432
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
433
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
434
- *v = sum; __syncwarp(actMask);
435
- }
436
- }
437
- }
438
-
439
- // First warp: Scan-8.
440
-
441
- __syncthreads();
442
-
443
- bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
444
- U32 scan8Mask = __ballot_sync(~0u, scan8);
445
- if (scan8)
446
- {
447
- int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
448
- volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
449
- v[0] = sum; __syncwarp(scan8Mask);
450
- #if (CR_BIN_SQR > 1 * 32)
451
- sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
452
- #endif
453
- #if (CR_BIN_SQR > 2 * 32)
454
- sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
455
- #endif
456
- #if (CR_BIN_SQR > 4 * 32)
457
- sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
458
- #endif
459
- }
460
-
461
- __syncthreads();
462
-
463
- // Tile per thread: Finalize prefix sums.
464
- // Single thread: Allocate segments.
465
-
466
- for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
467
- {
468
- int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
469
- int numEmits = sum >> emitShift;
470
- int numAllocs = sum & ((1 << emitShift) - 1);
471
- s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
472
- s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
473
-
474
- if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
475
- {
476
- int t = atomicAdd(&atomics.numTileSegs, numAllocs);
477
- s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
478
- }
479
- }
480
-
481
- __syncthreads();
482
- int firstAllocSeg = s_firstAllocSeg;
483
- int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
484
- int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
485
-
486
- //------------------------------------------------------------------------
487
- // Emit.
488
- //------------------------------------------------------------------------
489
-
490
- // Emit per thread: Write triangle index to globalmem.
491
-
492
- for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
493
- {
494
- // Find tile in bin.
495
-
496
- U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
497
- U8* tilePtr = tileBase;
498
- U8* ptr;
499
-
500
- #if (CR_BIN_SQR > 128)
501
- ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
502
- #endif
503
- #if (CR_BIN_SQR > 64)
504
- ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
505
- #endif
506
- #if (CR_BIN_SQR > 32)
507
- ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
508
- #endif
509
- #if (CR_BIN_SQR > 16)
510
- ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
511
- #endif
512
- #if (CR_BIN_SQR > 8)
513
- ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
514
- #endif
515
- #if (CR_BIN_SQR > 4)
516
- ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
517
- #endif
518
- #if (CR_BIN_SQR > 2)
519
- ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
520
- #endif
521
- #if (CR_BIN_SQR > 1)
522
- ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
523
- #endif
524
-
525
- int tileInBin = (tilePtr - tileBase) >> 2;
526
- int emitInTile = emitInBin - *(U32*)tilePtr;
527
-
528
- // Find warp in tile.
529
-
530
- int warpStep = (CR_BIN_SQR + 1) * 4;
531
- U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
532
- U8* warpPtr = warpBase;
533
-
534
- #if (CR_COARSE_WARPS > 8)
535
- ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
536
- #endif
537
- #if (CR_COARSE_WARPS > 4)
538
- ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
539
- #endif
540
- #if (CR_COARSE_WARPS > 2)
541
- ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
542
- #endif
543
- #if (CR_COARSE_WARPS > 1)
544
- ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
545
- #endif
546
-
547
- int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
548
- U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
549
- int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
550
-
551
- // Find thread in warp.
552
-
553
- int threadInWarp = 0;
554
- int pop = __popc(emitMask & 0xFFFF);
555
- bool pred = (emitInWarp >= pop);
556
- if (pred) emitInWarp -= pop;
557
- if (pred) emitMask >>= 0x10;
558
- if (pred) threadInWarp += 0x10;
559
-
560
- pop = __popc(emitMask & 0xFF);
561
- pred = (emitInWarp >= pop);
562
- if (pred) emitInWarp -= pop;
563
- if (pred) emitMask >>= 0x08;
564
- if (pred) threadInWarp += 0x08;
565
-
566
- pop = __popc(emitMask & 0xF);
567
- pred = (emitInWarp >= pop);
568
- if (pred) emitInWarp -= pop;
569
- if (pred) emitMask >>= 0x04;
570
- if (pred) threadInWarp += 0x04;
571
-
572
- pop = __popc(emitMask & 0x3);
573
- pred = (emitInWarp >= pop);
574
- if (pred) emitInWarp -= pop;
575
- if (pred) emitMask >>= 0x02;
576
- if (pred) threadInWarp += 0x02;
577
-
578
- if (emitInWarp >= (emitMask & 1))
579
- threadInWarp++;
580
-
581
- // Figure out where to write.
582
-
583
- int currOfs = s_tileStreamCurrOfs[tileInBin];
584
- int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
585
- int outOfs = emitInTile;
586
-
587
- if (outOfs < spaceLeft)
588
- outOfs += currOfs;
589
- else
590
- {
591
- int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
592
- outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
593
- }
594
-
595
- // Write.
596
-
597
- int queueIdx = warpInTile * 32 + threadInWarp;
598
- int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
599
-
600
- tileSegData[outOfs] = triIdx;
601
- }
602
-
603
- //------------------------------------------------------------------------
604
- // Patch.
605
- //------------------------------------------------------------------------
606
-
607
- // Allocated segment per thread: Initialize next-pointer and count.
608
-
609
- for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
610
- {
611
- int segIdx = firstAllocSeg + i;
612
- tileSegNext[segIdx] = segIdx + 1;
613
- tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
614
- }
615
-
616
- // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
617
-
618
- __syncthreads();
619
- for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
620
- {
621
- int oldOfs = s_tileStreamCurrOfs[tileInBin];
622
- int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
623
- int allocLo = s_tileAllocPrefixSum[tileInBin];
624
- int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
625
-
626
- if (allocLo != allocHi)
627
- {
628
- S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
629
- if (oldOfs < 0)
630
- nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
631
- *nextPtr = firstAllocSeg + allocLo;
632
-
633
- newOfs--;
634
- newOfs &= CR_TILE_SEG_SIZE - 1;
635
- newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
636
- newOfs++;
637
- }
638
- s_tileStreamCurrOfs[tileInBin] = newOfs;
639
- }
640
-
641
- // Advance queue read pointer.
642
- // Queue became empty => bin done.
643
-
644
- triQueueReadPos += CR_COARSE_WARPS * 32;
645
- }
646
- while (triQueueReadPos < triQueueWritePos);
647
-
648
- // Tile per thread: Fix next-pointer and count of the last segment.
649
- // 32 tiles per warp: Count active tiles.
650
-
651
- __syncthreads();
652
-
653
- for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
654
- {
655
- int tileInBin = tileInBin_base + thrInBlock;
656
- bool act = (tileInBin < CR_BIN_SQR);
657
- U32 actMask = __ballot_sync(~0u, act);
658
- if (act)
659
- {
660
- int tileX = tileInBin & (CR_BIN_SIZE - 1);
661
- int tileY = tileInBin >> CR_BIN_LOG2;
662
- bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
663
-
664
- int ofs = s_tileStreamCurrOfs[tileInBin];
665
- int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
666
- int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
667
-
668
- if (ofs >= 0)
669
- tileSegNext[segIdx] = -1;
670
- else if (force)
671
- {
672
- s_tileStreamCurrOfs[tileInBin] = 0;
673
- tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
674
- }
675
-
676
- if (segCount != 0)
677
- tileSegCount[segIdx] = segCount;
678
-
679
- U32 res = __ballot_sync(actMask, ofs >= 0 | force);
680
- if (threadIdx.x == 0)
681
- s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
682
- }
683
- }
684
-
685
- // First warp: Scan-8.
686
- // One thread: Allocate space for active tiles.
687
-
688
- __syncthreads();
689
-
690
- bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
691
- U32 scan8Mask = __ballot_sync(~0u, scan8);
692
- if (scan8)
693
- {
694
- volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
695
- U32 sum = v[0];
696
- #if (CR_BIN_SQR > 1 * 32)
697
- sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
698
- #endif
699
- #if (CR_BIN_SQR > 2 * 32)
700
- sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
701
- #endif
702
- #if (CR_BIN_SQR > 4 * 32)
703
- sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
704
- #endif
705
-
706
- if (thrInBlock == CR_BIN_SQR / 32 - 1)
707
- s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
708
- }
709
-
710
- // Tile per thread: Output active tiles.
711
-
712
- __syncthreads();
713
-
714
- for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
715
- {
716
- int tileInBin = tileInBin_base + thrInBlock;
717
- bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
718
- U32 actMask = __ballot_sync(~0u, act);
719
- if (act)
720
- {
721
- int activeIdx = s_firstActiveIdx;
722
- activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
723
- activeIdx += __popc(actMask & getLaneMaskLt());
724
- activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
725
- }
726
- }
727
- }
728
- }
729
-
730
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp DELETED
@@ -1,73 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
-
11
- //------------------------------------------------------------------------
12
-
13
- #define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize.
14
- #define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize.
15
-
16
- #define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize.
17
- #define CR_BIN_LOG2 4 // BinSize / TileSize.
18
- #define CR_TILE_LOG2 3 // TileSize / PixelSize.
19
-
20
- #define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries.
21
- #define CR_FLIPBIT_FLIP_Y 2
22
- #define CR_FLIPBIT_FLIP_X 3
23
- #define CR_FLIPBIT_SWAP_XY 4
24
- #define CR_FLIPBIT_COMPL 5
25
-
26
- #define CR_BIN_STREAMS_LOG2 4
27
- #define CR_BIN_SEG_LOG2 9 // 32-bit entries.
28
- #define CR_TILE_SEG_LOG2 5 // 32-bit entries.
29
-
30
- #define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster.
31
- #define CR_COARSE_QUEUE_LOG2 10 // Triangles.
32
-
33
- #define CR_SETUP_WARPS 2
34
- #define CR_SETUP_OPT_BLOCKS 8
35
- #define CR_BIN_WARPS 16
36
- #define CR_COARSE_WARPS 16 // Must be a power of two.
37
- #define CR_FINE_MAX_WARPS 20
38
-
39
- #define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block.
40
-
41
- //------------------------------------------------------------------------
42
-
43
- #define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2)
44
- #define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2)
45
- #define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2))
46
-
47
- #define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2)
48
- #define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2))
49
- #define CR_BIN_SIZE (1 << CR_BIN_LOG2)
50
- #define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2))
51
-
52
- #define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2)
53
- #define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2)
54
- #define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2))
55
- #define CR_TILE_SIZE (1 << CR_TILE_LOG2)
56
- #define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2))
57
-
58
- #define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2)
59
- #define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2)
60
- #define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2)
61
-
62
- #define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2)
63
- #define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2)
64
-
65
- //------------------------------------------------------------------------
66
- // When evaluating interpolated Z pixel centers, we may introduce an error
67
- // of (+-CR_LERP_ERROR) ULPs.
68
-
69
- #define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2))
70
- #define CR_DEPTH_MIN CR_LERP_ERROR(3)
71
- #define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3))
72
-
73
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp DELETED
@@ -1,79 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #include "Defs.hpp"
10
- #include "../CudaRaster.hpp"
11
- #include "RasterImpl.hpp"
12
-
13
- using namespace CR;
14
-
15
- //------------------------------------------------------------------------
16
- // Stub interface implementation.
17
- //------------------------------------------------------------------------
18
-
19
- CudaRaster::CudaRaster()
20
- {
21
- m_impl = new RasterImpl();
22
- }
23
-
24
- CudaRaster::~CudaRaster()
25
- {
26
- delete m_impl;
27
- }
28
-
29
- void CudaRaster::setBufferSize(int width, int height, int numImages)
30
- {
31
- m_impl->setBufferSize(Vec3i(width, height, numImages));
32
- }
33
-
34
- void CudaRaster::setViewport(int width, int height, int offsetX, int offsetY)
35
- {
36
- m_impl->setViewport(Vec2i(width, height), Vec2i(offsetX, offsetY));
37
- }
38
-
39
- void CudaRaster::setRenderModeFlags(U32 flags)
40
- {
41
- m_impl->setRenderModeFlags(flags);
42
- }
43
-
44
- void CudaRaster::deferredClear(U32 clearColor)
45
- {
46
- m_impl->deferredClear(clearColor);
47
- }
48
-
49
- void CudaRaster::setVertexBuffer(void* vertices, int numVertices)
50
- {
51
- m_impl->setVertexBuffer(vertices, numVertices);
52
- }
53
-
54
- void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
55
- {
56
- m_impl->setIndexBuffer(indices, numTriangles);
57
- }
58
-
59
- bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream)
60
- {
61
- return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream);
62
- }
63
-
64
- void* CudaRaster::getColorBuffer(void)
65
- {
66
- return m_impl->getColorBuffer();
67
- }
68
-
69
- void* CudaRaster::getDepthBuffer(void)
70
- {
71
- return m_impl->getDepthBuffer();
72
- }
73
-
74
- void CudaRaster::swapDepthAndPeel(void)
75
- {
76
- m_impl->swapDepthAndPeel();
77
- }
78
-
79
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp DELETED
@@ -1,90 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
- #include <cuda_runtime.h>
11
- #include <cstdint>
12
-
13
- namespace CR
14
- {
15
- //------------------------------------------------------------------------
16
-
17
- #ifndef NULL
18
- # define NULL 0
19
- #endif
20
-
21
- #ifdef __CUDACC__
22
- # define CR_CUDA 1
23
- #else
24
- # define CR_CUDA 0
25
- #endif
26
-
27
- #if CR_CUDA
28
- # define CR_CUDA_FUNC __device__ __inline__
29
- # define CR_CUDA_CONST __constant__
30
- #else
31
- # define CR_CUDA_FUNC inline
32
- # define CR_CUDA_CONST static const
33
- #endif
34
-
35
- #define CR_UNREF(X) ((void)(X))
36
- #define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0])))
37
-
38
- //------------------------------------------------------------------------
39
-
40
- typedef uint8_t U8;
41
- typedef uint16_t U16;
42
- typedef uint32_t U32;
43
- typedef uint64_t U64;
44
- typedef int8_t S8;
45
- typedef int16_t S16;
46
- typedef int32_t S32;
47
- typedef int64_t S64;
48
- typedef float F32;
49
- typedef double F64;
50
- typedef void (*FuncPtr)(void);
51
-
52
- //------------------------------------------------------------------------
53
-
54
- #define CR_U32_MAX (0xFFFFFFFFu)
55
- #define CR_S32_MIN (~0x7FFFFFFF)
56
- #define CR_S32_MAX (0x7FFFFFFF)
57
- #define CR_U64_MAX ((U64)(S64)-1)
58
- #define CR_S64_MIN ((S64)-1 << 63)
59
- #define CR_S64_MAX (~((S64)-1 << 63))
60
- #define CR_F32_MIN (1.175494351e-38f)
61
- #define CR_F32_MAX (3.402823466e+38f)
62
- #define CR_F64_MIN (2.2250738585072014e-308)
63
- #define CR_F64_MAX (1.7976931348623158e+308)
64
-
65
- //------------------------------------------------------------------------
66
- // Misc types.
67
-
68
- class Vec2i
69
- {
70
- public:
71
- Vec2i(int x_, int y_) : x(x_), y(y_) {}
72
- int x, y;
73
- };
74
-
75
- class Vec3i
76
- {
77
- public:
78
- Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
79
- int x, y, z;
80
- };
81
-
82
- //------------------------------------------------------------------------
83
- // CUDA utilities.
84
-
85
- #if CR_CUDA
86
- # define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y)))
87
- #endif
88
-
89
- //------------------------------------------------------------------------
90
- } // namespace CR
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl DELETED
@@ -1,385 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- //------------------------------------------------------------------------
10
- // Utility funcs.
11
- //------------------------------------------------------------------------
12
-
13
- __device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth)
14
- {
15
- tileZMax = CR_DEPTH_MAX;
16
- tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax);
17
- }
18
-
19
- __device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp)
20
- {
21
- // Entry is warp-coherent.
22
- if (__any_sync(~0u, tileZUpd))
23
- {
24
- U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp();
25
- temp[threadIdx.x + 16] = z; __syncwarp();
26
- z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
27
- z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
28
- z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
29
- z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
30
- z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
31
- tileZMax = temp[47];
32
- tileZUpd = false;
33
- }
34
- }
35
-
36
- //------------------------------------------------------------------------
37
-
38
- __device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment)
39
- {
40
- const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;;
41
- const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
42
- const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
43
- const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
44
-
45
- if (threadIdx.x >= tileSegCount[segment])
46
- {
47
- triIdx = -1;
48
- dataIdx = -1;
49
- }
50
- else
51
- {
52
- int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x];
53
- triIdx = subtriIdx >> 3;
54
- dataIdx = triIdx;
55
- subtriIdx &= 7;
56
- if (subtriIdx != 7)
57
- dataIdx = triHeaderPtr[triIdx].misc + subtriIdx;
58
- triHeader = *((uint4*)triHeaderPtr + dataIdx);
59
- }
60
-
61
- // advance to next segment
62
- segment = tileSegNext[segment];
63
- }
64
-
65
- //------------------------------------------------------------------------
66
-
67
- __device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax)
68
- {
69
- U32 zmin = triHeader.w & 0xFFFFF000;
70
- return (zmin > tileZMax);
71
- }
72
-
73
- //------------------------------------------------------------------------
74
-
75
- __device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut)
76
- {
77
- int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
78
- int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
79
-
80
- // extract S16 vertex positions while subtracting tile coordinates
81
- S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX);
82
- S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY);
83
- S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x);
84
- S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x);
85
- S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z);
86
- S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z);
87
-
88
- // extract flipbits
89
- U32 f01 = (triHeader.w >> 6) & 0x3C;
90
- U32 f12 = (triHeader.w >> 2) & 0x3C;
91
- U32 f20 = (triHeader.w << 2) & 0x3C;
92
-
93
- // compute per-edge coverage masks
94
- U64 c01, c12, c20;
95
- c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut);
96
- c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut);
97
- c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut);
98
-
99
- // combine masks
100
- return c01 & c12 & c20;
101
- }
102
-
103
- //------------------------------------------------------------------------
104
-
105
- __device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp)
106
- {
107
- __syncwarp();
108
- temp[threadIdx.x + 16] = value; __syncwarp();
109
- value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
110
- value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
111
- value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
112
- value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
113
- value += temp[threadIdx.x + 16 - 16]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
114
- return value;
115
- }
116
-
117
- __device__ __inline__ volatile const U32& scan32_total(volatile U32* temp)
118
- {
119
- return temp[47];
120
- }
121
-
122
- //------------------------------------------------------------------------
123
-
124
- __device__ __inline__ S32 findBit(U64 mask, int idx)
125
- {
126
- U32 x = getLo(mask);
127
- int pop = __popc(x);
128
- bool p = (pop <= idx);
129
- if (p) x = getHi(mask);
130
- if (p) idx -= pop;
131
- int bit = p ? 32 : 0;
132
-
133
- pop = __popc(x & 0x0000ffffu);
134
- p = (pop <= idx);
135
- if (p) x >>= 16;
136
- if (p) bit += 16;
137
- if (p) idx -= pop;
138
-
139
- U32 tmp = x & 0x000000ffu;
140
- pop = __popc(tmp);
141
- p = (pop <= idx);
142
- if (p) tmp = x & 0x0000ff00u;
143
- if (p) idx -= pop;
144
-
145
- return findLeadingOne(tmp) + bit - idx;
146
- }
147
-
148
- //------------------------------------------------------------------------
149
- // Single-sample implementation.
150
- //------------------------------------------------------------------------
151
-
152
- __device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask)
153
- {
154
- atomicMin((U32*)pDepth, depth);
155
- __syncwarp(ropMask);
156
- bool act = (depth == *pDepth);
157
- __syncwarp(ropMask);
158
- U32 actMask = __ballot_sync(ropMask, act);
159
- if (act)
160
- {
161
- *pDepth = 0;
162
- __syncwarp(actMask);
163
- atomicMax((U32*)pDepth, threadIdx.x);
164
- __syncwarp(actMask);
165
- if (*pDepth == threadIdx.x)
166
- {
167
- *pDepth = depth;
168
- *pColor = color;
169
- }
170
- __syncwarp(actMask);
171
- }
172
- }
173
-
174
- //------------------------------------------------------------------------
175
-
176
- __device__ __inline__ void fineRasterImpl(const CRParams p)
177
- {
178
- // for 20 warps:
179
- __shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB
180
- __shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
181
- __shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
182
- __shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
183
- __shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData index
184
- __shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask
185
- __shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index
186
- __shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB
187
- // = 47.25KB total
188
-
189
- CRAtomics& atomics = p.atomics[blockIdx.z];
190
- const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris;
191
-
192
- const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
193
- const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
194
-
195
- volatile U32* tileColor = s_tileColor[threadIdx.y];
196
- volatile U32* tileDepth = s_tileDepth[threadIdx.y];
197
- volatile U32* tilePeel = s_tilePeel[threadIdx.y];
198
- volatile U32* triDataIdx = s_triDataIdx[threadIdx.y];
199
- volatile U64* triangleCov = s_triangleCov[threadIdx.y];
200
- volatile U32* triangleFrag = s_triangleFrag[threadIdx.y];
201
- volatile U32* temp = s_temp[threadIdx.y];
202
-
203
- if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs)
204
- return;
205
-
206
- temp[threadIdx.x] = 0; // first 16 elements of temp are always zero
207
- cover8x8_setupLUT(s_cover8x8_lut);
208
- __syncthreads();
209
-
210
- // loop over tiles
211
- for (;;)
212
- {
213
- // pick a tile
214
- if (threadIdx.x == 0)
215
- temp[16] = atomicAdd(&atomics.fineCounter, 1);
216
- __syncwarp();
217
- int activeIdx = temp[16];
218
- if (activeIdx >= atomics.numActiveTiles)
219
- break;
220
-
221
- int tileIdx = activeTiles[activeIdx];
222
- S32 segment = tileFirstSeg[tileIdx];
223
- int tileY = tileIdx / p.widthTiles;
224
- int tileX = tileIdx - tileY * p.widthTiles;
225
- int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
226
- int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
227
-
228
- // initialize per-tile state
229
- int triRead = 0, triWrite = 0;
230
- int fragRead = 0, fragWrite = 0;
231
- if (threadIdx.x == 0)
232
- triangleFrag[63] = 0; // "previous triangle"
233
-
234
- // deferred clear => clear tile
235
- if (p.deferredClear)
236
- {
237
- tileColor[threadIdx.x] = p.clearColor;
238
- tileDepth[threadIdx.x] = p.clearDepth;
239
- tileColor[threadIdx.x + 32] = p.clearColor;
240
- tileDepth[threadIdx.x + 32] = p.clearDepth;
241
- }
242
- else // otherwise => read tile from framebuffer
243
- {
244
- U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
245
- U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
246
- tileColor[threadIdx.x] = pColor[px + p.strideX * py];
247
- tileDepth[threadIdx.x] = pDepth[px + p.strideX * py];
248
- tileColor[threadIdx.x + 32] = pColor[px + p.strideX * (py + 4)];
249
- tileDepth[threadIdx.x + 32] = pDepth[px + p.strideX * (py + 4)];
250
- }
251
-
252
- // read peeling inputs if enabled
253
- if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling)
254
- {
255
- U32* pPeel = (U32*)p.peelBuffer + p.strideX * p.strideY * blockIdx.z;
256
- tilePeel[threadIdx.x] = pPeel[px + p.strideX * py];
257
- tilePeel[threadIdx.x + 32] = pPeel[px + p.strideX * (py + 4)];
258
- }
259
-
260
- U32 tileZMax;
261
- bool tileZUpd;
262
- initTileZMax(tileZMax, tileZUpd, tileDepth);
263
-
264
- // process fragments
265
- for(;;)
266
- {
267
- // need to queue more fragments?
268
- if (fragWrite - fragRead < 32 && segment >= 0)
269
- {
270
- // update tile z - coherent over warp
271
- updateTileZMax(tileZMax, tileZUpd, tileDepth, temp);
272
-
273
- // read triangles
274
- do
275
- {
276
- // read triangle index and data, advance to next segment
277
- S32 triIdx, dataIdx;
278
- uint4 triHeader;
279
- getTriangle(p, triIdx, dataIdx, triHeader, segment);
280
-
281
- // early z cull
282
- if (triIdx >= 0 && earlyZCull(triHeader, tileZMax))
283
- triIdx = -1;
284
-
285
- // determine coverage
286
- U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut);
287
- S32 pop = (triIdx == -1) ? 0 : __popcll(coverage);
288
-
289
- // fragment count scan
290
- U32 frag = scan32_value(pop, temp);
291
- frag += fragWrite; // frag now holds cumulative fragment count
292
- fragWrite += scan32_total(temp);
293
-
294
- // queue non-empty triangles
295
- U32 goodMask = __ballot_sync(~0u, pop != 0);
296
- if (pop != 0)
297
- {
298
- int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63;
299
- triDataIdx [idx] = dataIdx;
300
- triangleFrag[idx] = frag;
301
- triangleCov [idx] = coverage;
302
- }
303
- triWrite += __popc(goodMask);
304
- }
305
- while (fragWrite - fragRead < 32 && segment >= 0);
306
- }
307
- __syncwarp();
308
-
309
- // end of segment?
310
- if (fragRead == fragWrite)
311
- break;
312
-
313
- // clear triangle boundaries
314
- temp[threadIdx.x + 16] = 0;
315
- __syncwarp();
316
-
317
- // tag triangle boundaries
318
- if (triRead + threadIdx.x < triWrite)
319
- {
320
- int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead;
321
- if (idx <= 32)
322
- temp[idx + 16 - 1] = 1;
323
- }
324
- __syncwarp();
325
-
326
- int ropLaneIdx = threadIdx.x;
327
- U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]);
328
-
329
- // distribute fragments
330
- bool hasFragment = (ropLaneIdx < fragWrite - fragRead);
331
- U32 fragmentMask = __ballot_sync(~0u, hasFragment);
332
- if (hasFragment)
333
- {
334
- int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63;
335
- int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]);
336
- U64 coverage = triangleCov[triBufIdx];
337
- int pixelInTile = findBit(coverage, fragIdx);
338
- int dataIdx = triDataIdx[triBufIdx];
339
-
340
- // determine pixel position
341
- U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7);
342
- U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3);
343
-
344
- // depth test
345
- U32 depth = 0;
346
- uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4));
347
-
348
- depth = td.x * pixelX + td.y * pixelY + td.z;
349
- bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]);
350
- if (!zkill)
351
- {
352
- U32 oldDepth = tileDepth[pixelInTile];
353
- if (depth > oldDepth)
354
- zkill = true;
355
- else if (oldDepth == tileZMax)
356
- tileZUpd = true; // we are replacing previous zmax => need to update
357
- }
358
-
359
- U32 ropMask = __ballot_sync(fragmentMask, !zkill);
360
- if (!zkill)
361
- executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask);
362
- }
363
- // no need to sync, as next up is updateTileZMax that does internal warp sync
364
-
365
- // update counters
366
- fragRead = ::min(fragRead + 32, fragWrite);
367
- triRead += __popc(boundaryMask);
368
- }
369
-
370
- // Write tile back to the framebuffer.
371
- if (true)
372
- {
373
- int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
374
- int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
375
- U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
376
- U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
377
- pColor[px + p.strideX * py] = tileColor[threadIdx.x];
378
- pDepth[px + p.strideX * py] = tileDepth[threadIdx.x];
379
- pColor[px + p.strideX * (py + 4)] = tileColor[threadIdx.x + 32];
380
- pDepth[px + p.strideX * (py + 4)] = tileDepth[threadIdx.x + 32];
381
- }
382
- }
383
- }
384
-
385
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp DELETED
@@ -1,153 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
- #include "Defs.hpp"
11
- #include "Constants.hpp"
12
-
13
- namespace CR
14
- {
15
- //------------------------------------------------------------------------
16
- // Projected triangle.
17
- //------------------------------------------------------------------------
18
-
19
- struct CRTriangleHeader
20
- {
21
- S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1.
22
- S16 v0y;
23
- S16 v1x;
24
- S16 v1y;
25
- S16 v2x;
26
- S16 v2y;
27
-
28
- U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase)
29
- };
30
-
31
- //------------------------------------------------------------------------
32
-
33
- struct CRTriangleData
34
- {
35
- U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2)
36
- U32 zy;
37
- U32 zb;
38
- U32 id; // Triangle id.
39
- };
40
-
41
- //------------------------------------------------------------------------
42
- // Device-side structures.
43
- //------------------------------------------------------------------------
44
-
45
- struct CRAtomics
46
- {
47
- // Setup.
48
- S32 numSubtris; // = numTris
49
-
50
- // Bin.
51
- S32 binCounter; // = 0
52
- S32 numBinSegs; // = 0
53
-
54
- // Coarse.
55
- S32 coarseCounter; // = 0
56
- S32 numTileSegs; // = 0
57
- S32 numActiveTiles; // = 0
58
-
59
- // Fine.
60
- S32 fineCounter; // = 0
61
- };
62
-
63
- //------------------------------------------------------------------------
64
-
65
- struct CRImageParams
66
- {
67
- S32 triOffset; // First triangle index to draw.
68
- S32 triCount; // Number of triangles to draw.
69
- S32 binBatchSize; // Number of triangles per batch.
70
- };
71
-
72
- //------------------------------------------------------------------------
73
-
74
- struct CRParams
75
- {
76
- // Common.
77
-
78
- CRAtomics* atomics; // Work counters. Per-image.
79
- S32 numImages; // Batch size.
80
- S32 totalCount; // In range mode, total number of triangles to render.
81
- S32 instanceMode; // 0 = range mode, 1 = instance mode.
82
-
83
- S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode.
84
- S32 numTriangles; // Number of triangles in input buffer.
85
- void* vertexBuffer; // numVertices * float4(x, y, z, w)
86
- void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2)
87
-
88
- S32 widthPixels; // Render buffer size in pixels. Must be multiple of tile size (8x8).
89
- S32 heightPixels;
90
- S32 widthPixelsVp; // Viewport size in pixels.
91
- S32 heightPixelsVp;
92
- S32 widthBins; // widthPixels / CR_BIN_SIZE
93
- S32 heightBins; // heightPixels / CR_BIN_SIZE
94
- S32 numBins; // widthBins * heightBins
95
-
96
- F32 xs; // Vertex position adjustments for tiled rendering.
97
- F32 ys;
98
- F32 xo;
99
- F32 yo;
100
-
101
- S32 widthTiles; // widthPixels / CR_TILE_SIZE
102
- S32 heightTiles; // heightPixels / CR_TILE_SIZE
103
- S32 numTiles; // widthTiles * heightTiles
104
-
105
- U32 renderModeFlags;
106
- S32 deferredClear; // 1 = Clear framebuffer before rendering triangles.
107
- U32 clearColor;
108
- U32 clearDepth;
109
-
110
- // These are uniform across batch.
111
-
112
- S32 maxSubtris;
113
- S32 maxBinSegs;
114
- S32 maxTileSegs;
115
-
116
- // Setup output / bin input.
117
-
118
- void* triSubtris; // maxSubtris * U8
119
- void* triHeader; // maxSubtris * CRTriangleHeader
120
- void* triData; // maxSubtris * CRTriangleData
121
-
122
- // Bin output / coarse input.
123
-
124
- void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32
125
- void* binSegNext; // maxBinSegs * S32
126
- void* binSegCount; // maxBinSegs * S32
127
- void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none
128
- void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris)
129
-
130
- // Coarse output / fine input.
131
-
132
- void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32
133
- void* tileSegNext; // maxTileSegs * S32
134
- void* tileSegCount; // maxTileSegs * S32
135
- void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx)
136
- void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none
137
-
138
- // Surface buffers. Outer tile offset is baked into pointers.
139
-
140
- void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32
141
- void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32
142
- void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled.
143
- S32 strideX; // horizontal size in pixels
144
- S32 strideY; // vertical stride in pixels
145
-
146
- // Per-image parameters for first images are embedded here to avoid extra memcpy for small batches.
147
-
148
- CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS];
149
- const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS.
150
- };
151
-
152
- //------------------------------------------------------------------------
153
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp DELETED
@@ -1,370 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #include "../../framework.h"
10
- #include "PrivateDefs.hpp"
11
- #include "Constants.hpp"
12
- #include "RasterImpl.hpp"
13
- #include <cuda_runtime.h>
14
-
15
- using namespace CR;
16
- using std::min;
17
- using std::max;
18
-
19
- //------------------------------------------------------------------------
20
- // Kernel prototypes and variables.
21
-
22
- void triangleSetupKernel (const CRParams p);
23
- void binRasterKernel (const CRParams p);
24
- void coarseRasterKernel (const CRParams p);
25
- void fineRasterKernel (const CRParams p);
26
-
27
- //------------------------------------------------------------------------
28
-
29
- RasterImpl::RasterImpl(void)
30
- : m_renderModeFlags (0),
31
- m_deferredClear (false),
32
- m_clearColor (0),
33
- m_vertexPtr (NULL),
34
- m_indexPtr (NULL),
35
- m_numVertices (0),
36
- m_numTriangles (0),
37
- m_bufferSizesReported (0),
38
-
39
- m_numImages (0),
40
- m_bufferSizePixels (0, 0),
41
- m_bufferSizeVp (0, 0),
42
- m_sizePixels (0, 0),
43
- m_sizeVp (0, 0),
44
- m_offsetPixels (0, 0),
45
- m_sizeBins (0, 0),
46
- m_numBins (0),
47
- m_sizeTiles (0, 0),
48
- m_numTiles (0),
49
-
50
- m_numSMs (1),
51
- m_numCoarseBlocksPerSM (1),
52
- m_numFineBlocksPerSM (1),
53
- m_numFineWarpsPerBlock (1),
54
-
55
- m_maxSubtris (1),
56
- m_maxBinSegs (1),
57
- m_maxTileSegs (1)
58
- {
59
- // Query relevant device attributes.
60
-
61
- int currentDevice = 0;
62
- NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&currentDevice));
63
- NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice));
64
- cudaFuncAttributes attr;
65
- NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel));
66
- m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS);
67
- NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0));
68
- NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0));
69
-
70
- // Setup functions.
71
-
72
- NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared));
73
- NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared));
74
- NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared));
75
- NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared));
76
- }
77
-
78
- //------------------------------------------------------------------------
79
-
80
- RasterImpl::~RasterImpl(void)
81
- {
82
- // Empty.
83
- }
84
-
85
- //------------------------------------------------------------------------
86
-
87
- void RasterImpl::setBufferSize(Vec3i size)
88
- {
89
- // Internal buffer width and height must be divisible by tile size.
90
- int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
91
- int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
92
-
93
- m_bufferSizePixels = Vec2i(w, h);
94
- m_bufferSizeVp = Vec2i(size.x, size.y);
95
- m_numImages = size.z;
96
-
97
- m_colorBuffer.reset(w * h * size.z * sizeof(U32));
98
- m_depthBuffer.reset(w * h * size.z * sizeof(U32));
99
- }
100
-
101
- //------------------------------------------------------------------------
102
-
103
- void RasterImpl::setViewport(Vec2i size, Vec2i offset)
104
- {
105
- // Offset must be divisible by tile size.
106
- NVDR_CHECK((offset.x & (CR_TILE_SIZE - 1)) == 0 && (offset.y & (CR_TILE_SIZE - 1)) == 0, "invalid viewport offset");
107
-
108
- // Round internal viewport size to multiples of tile size.
109
- int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
110
- int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
111
-
112
- m_sizePixels = Vec2i(w, h);
113
- m_offsetPixels = offset;
114
- m_sizeVp = Vec2i(size.x, size.y);
115
- m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2;
116
- m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2;
117
- m_numTiles = m_sizeTiles.x * m_sizeTiles.y;
118
- m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
119
- m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
120
- m_numBins = m_sizeBins.x * m_sizeBins.y;
121
- }
122
-
123
- void RasterImpl::swapDepthAndPeel(void)
124
- {
125
- m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer.
126
-
127
- void* tmp = m_depthBuffer.getPtr();
128
- m_depthBuffer.setPtr(m_peelBuffer.getPtr());
129
- m_peelBuffer.setPtr(tmp);
130
- }
131
-
132
- //------------------------------------------------------------------------
133
-
134
- bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream)
135
- {
136
- bool instanceMode = (!ranges);
137
-
138
- int maxSubtrisSlack = 4096; // x 81B = 324KB
139
- int maxBinSegsSlack = 256; // x 2137B = 534KB
140
- int maxTileSegsSlack = 4096; // x 136B = 544KB
141
-
142
- // Resize atomics as needed.
143
- m_crAtomics .grow(m_numImages * sizeof(CRAtomics));
144
- m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics));
145
-
146
- // Size of these buffers doesn't depend on input.
147
- m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
148
- m_binTotal .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
149
- m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
150
- m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
151
-
152
- // Construct per-image parameters and determine worst-case buffer sizes.
153
- m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams));
154
- CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
155
- for (int i=0; i < m_numImages; i++)
156
- {
157
- CRImageParams& ip = imageParams[i];
158
-
159
- int roundSize = CR_BIN_WARPS * 32;
160
- int minBatches = CR_BIN_STREAMS_SIZE * 2;
161
- int maxRounds = 32;
162
-
163
- ip.triOffset = instanceMode ? 0 : ranges[i].x;
164
- ip.triCount = instanceMode ? m_numTriangles : ranges[i].y;
165
- ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize;
166
-
167
- m_maxSubtris = max(m_maxSubtris, min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
168
- m_maxBinSegs = max(m_maxBinSegs, max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack);
169
- m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack);
170
- }
171
-
172
- // Retry until successful.
173
-
174
- for (;;)
175
- {
176
- // Allocate buffers.
177
- m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8));
178
- m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader));
179
- m_triData .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData));
180
-
181
- m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32));
182
- m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32));
183
- m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32));
184
-
185
- m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32));
186
- m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32));
187
- m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32));
188
-
189
- // Report if buffers grow from last time.
190
- size_t sizesTotal = getTotalBufferSizes();
191
- if (sizesTotal > m_bufferSizesReported)
192
- {
193
- size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up.
194
- sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age.
195
- LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB";
196
- m_bufferSizesReported = sizesMB << 20;
197
- }
198
-
199
- // Launch stages. Blocks until everything is done.
200
- launchStages(instanceMode, peel, stream);
201
-
202
- // Peeling iteration cannot fail, so no point checking things further.
203
- if (peel)
204
- break;
205
-
206
- // Atomics after coarse stage are now available.
207
- CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
208
-
209
- // Success?
210
- bool failed = false;
211
- for (int i=0; i < m_numImages; i++)
212
- {
213
- const CRAtomics& a = atomics[i];
214
- failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs);
215
- }
216
- if (!failed)
217
- break; // Success!
218
-
219
- // If we were already at maximum capacity, no can do.
220
- if (m_maxSubtris == CR_MAXSUBTRIS_SIZE)
221
- return false;
222
-
223
- // Enlarge buffers and try again.
224
- for (int i=0; i < m_numImages; i++)
225
- {
226
- const CRAtomics& a = atomics[i];
227
- m_maxSubtris = max(m_maxSubtris, min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
228
- m_maxBinSegs = max(m_maxBinSegs, a.numBinSegs + maxBinSegsSlack);
229
- m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack);
230
- }
231
- }
232
-
233
- m_deferredClear = false;
234
- return true; // Success.
235
- }
236
-
237
- //------------------------------------------------------------------------
238
-
239
- size_t RasterImpl::getTotalBufferSizes(void) const
240
- {
241
- return
242
- m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params.
243
- m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() +
244
- m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() +
245
- m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize();
246
- }
247
-
248
- //------------------------------------------------------------------------
249
-
250
- void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
251
- {
252
- CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
253
-
254
- // Unless peeling, initialize atomics to mostly zero.
255
- CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
256
- if (!peel)
257
- {
258
- memset(atomics, 0, m_numImages * sizeof(CRAtomics));
259
- for (int i=0; i < m_numImages; i++)
260
- atomics[i].numSubtris = imageParams[i].triCount;
261
- }
262
-
263
- // Copy to device. If peeling, this is the state after coarse raster launch on first iteration.
264
- NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
265
-
266
- // Copy per-image parameters if there are more than fits in launch parameter block and we haven't done it already.
267
- if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS)
268
- {
269
- int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
270
- m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
271
- NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream));
272
- }
273
-
274
- // Set global parameters.
275
- CRParams p;
276
- {
277
- p.atomics = (CRAtomics*)m_crAtomics.getPtr();
278
- p.numImages = m_numImages;
279
- p.totalCount = 0; // Only relevant in range mode.
280
- p.instanceMode = instanceMode ? 1 : 0;
281
-
282
- p.numVertices = m_numVertices;
283
- p.numTriangles = m_numTriangles;
284
- p.vertexBuffer = m_vertexPtr;
285
- p.indexBuffer = m_indexPtr;
286
-
287
- p.widthPixels = m_sizePixels.x;
288
- p.heightPixels = m_sizePixels.y;
289
- p.widthPixelsVp = m_sizeVp.x;
290
- p.heightPixelsVp = m_sizeVp.y;
291
- p.widthBins = m_sizeBins.x;
292
- p.heightBins = m_sizeBins.y;
293
- p.numBins = m_numBins;
294
-
295
- p.xs = (float)m_bufferSizeVp.x / (float)m_sizeVp.x;
296
- p.ys = (float)m_bufferSizeVp.y / (float)m_sizeVp.y;
297
- p.xo = (float)(m_bufferSizeVp.x - m_sizeVp.x - 2 * m_offsetPixels.x) / (float)m_sizeVp.x;
298
- p.yo = (float)(m_bufferSizeVp.y - m_sizeVp.y - 2 * m_offsetPixels.y) / (float)m_sizeVp.y;
299
-
300
- p.widthTiles = m_sizeTiles.x;
301
- p.heightTiles = m_sizeTiles.y;
302
- p.numTiles = m_numTiles;
303
-
304
- p.renderModeFlags = m_renderModeFlags;
305
- p.deferredClear = m_deferredClear ? 1 : 0;
306
- p.clearColor = m_clearColor;
307
- p.clearDepth = CR_DEPTH_MAX;
308
-
309
- p.maxSubtris = m_maxSubtris;
310
- p.maxBinSegs = m_maxBinSegs;
311
- p.maxTileSegs = m_maxTileSegs;
312
-
313
- p.triSubtris = m_triSubtris.getPtr();
314
- p.triHeader = m_triHeader.getPtr();
315
- p.triData = m_triData.getPtr();
316
- p.binSegData = m_binSegData.getPtr();
317
- p.binSegNext = m_binSegNext.getPtr();
318
- p.binSegCount = m_binSegCount.getPtr();
319
- p.binFirstSeg = m_binFirstSeg.getPtr();
320
- p.binTotal = m_binTotal.getPtr();
321
- p.tileSegData = m_tileSegData.getPtr();
322
- p.tileSegNext = m_tileSegNext.getPtr();
323
- p.tileSegCount = m_tileSegCount.getPtr();
324
- p.activeTiles = m_activeTiles.getPtr();
325
- p.tileFirstSeg = m_tileFirstSeg.getPtr();
326
-
327
- size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32);
328
- p.colorBuffer = m_colorBuffer.getPtr(byteOffset);
329
- p.depthBuffer = m_depthBuffer.getPtr(byteOffset);
330
- p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr(byteOffset) : 0;
331
- p.strideX = m_bufferSizePixels.x;
332
- p.strideY = m_bufferSizePixels.y;
333
-
334
- memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
335
- p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr();
336
- }
337
-
338
- // Setup block sizes.
339
-
340
- dim3 brBlock(32, CR_BIN_WARPS);
341
- dim3 crBlock(32, CR_COARSE_WARPS);
342
- dim3 frBlock(32, m_numFineWarpsPerBlock);
343
- void* args[] = {&p};
344
-
345
- // Launch stages from setup to coarse and copy atomics to host only if this is not a single-tile peeling iteration.
346
- if (!peel)
347
- {
348
- if (instanceMode)
349
- {
350
- int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
351
- NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
352
- }
353
- else
354
- {
355
- for (int i=0; i < m_numImages; i++)
356
- p.totalCount += imageParams[i].triCount;
357
- int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
358
- NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
359
- }
360
- NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
361
- NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
362
- NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
363
- }
364
-
365
- // Fine rasterizer is launched always.
366
- NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
367
- NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
368
- }
369
-
370
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp DELETED
@@ -1,102 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
- #include "PrivateDefs.hpp"
11
- #include "Buffer.hpp"
12
- #include "../CudaRaster.hpp"
13
-
14
- namespace CR
15
- {
16
- //------------------------------------------------------------------------
17
-
18
- class RasterImpl
19
- {
20
- public:
21
- RasterImpl (void);
22
- ~RasterImpl (void);
23
-
24
- void setBufferSize (Vec3i size);
25
- void setViewport (Vec2i size, Vec2i offset);
26
- void setRenderModeFlags (U32 flags) { m_renderModeFlags = flags; }
27
- void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; }
28
- void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer.
29
- void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer.
30
- bool drawTriangles (const Vec2i* ranges, bool peel, cudaStream_t stream);
31
- void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer.
32
- void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer.
33
- void swapDepthAndPeel (void);
34
- size_t getTotalBufferSizes (void) const;
35
-
36
- private:
37
- void launchStages (bool instanceMode, bool peel, cudaStream_t stream);
38
-
39
- // State.
40
-
41
- unsigned int m_renderModeFlags;
42
- bool m_deferredClear;
43
- unsigned int m_clearColor;
44
- void* m_vertexPtr;
45
- void* m_indexPtr;
46
- int m_numVertices; // Input buffer size.
47
- int m_numTriangles; // Input buffer size.
48
- size_t m_bufferSizesReported; // Previously reported buffer sizes.
49
-
50
- // Surfaces.
51
-
52
- Buffer m_colorBuffer;
53
- Buffer m_depthBuffer;
54
- Buffer m_peelBuffer;
55
- int m_numImages;
56
- Vec2i m_bufferSizePixels; // Internal buffer size.
57
- Vec2i m_bufferSizeVp; // Total viewport size.
58
- Vec2i m_sizePixels; // Internal size at which all computation is done, buffers reserved, etc.
59
- Vec2i m_sizeVp; // Size to which output will be cropped outside, determines viewport size.
60
- Vec2i m_offsetPixels; // Viewport offset for tiled rendering.
61
- Vec2i m_sizeBins;
62
- S32 m_numBins;
63
- Vec2i m_sizeTiles;
64
- S32 m_numTiles;
65
-
66
- // Launch sizes etc.
67
-
68
- S32 m_numSMs;
69
- S32 m_numCoarseBlocksPerSM;
70
- S32 m_numFineBlocksPerSM;
71
- S32 m_numFineWarpsPerBlock;
72
-
73
- // Global intermediate buffers. Individual images have offsets to these.
74
-
75
- Buffer m_crAtomics;
76
- HostBuffer m_crAtomicsHost;
77
- HostBuffer m_crImageParamsHost;
78
- Buffer m_crImageParamsExtra;
79
- Buffer m_triSubtris;
80
- Buffer m_triHeader;
81
- Buffer m_triData;
82
- Buffer m_binFirstSeg;
83
- Buffer m_binTotal;
84
- Buffer m_binSegData;
85
- Buffer m_binSegNext;
86
- Buffer m_binSegCount;
87
- Buffer m_activeTiles;
88
- Buffer m_tileFirstSeg;
89
- Buffer m_tileSegData;
90
- Buffer m_tileSegNext;
91
- Buffer m_tileSegCount;
92
-
93
- // Actual buffer sizes.
94
-
95
- S32 m_maxSubtris;
96
- S32 m_maxBinSegs;
97
- S32 m_maxTileSegs;
98
- };
99
-
100
- //------------------------------------------------------------------------
101
- } // namespace CR
102
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl_.cu DELETED
@@ -1,37 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #include "../CudaRaster.hpp"
10
- #include "PrivateDefs.hpp"
11
- #include "Constants.hpp"
12
- #include "Util.inl"
13
-
14
- namespace CR
15
- {
16
-
17
- //------------------------------------------------------------------------
18
- // Stage implementations.
19
- //------------------------------------------------------------------------
20
-
21
- #include "TriangleSetup.inl"
22
- #include "BinRaster.inl"
23
- #include "CoarseRaster.inl"
24
- #include "FineRaster.inl"
25
-
26
- }
27
-
28
- //------------------------------------------------------------------------
29
- // Stage entry points.
30
- //------------------------------------------------------------------------
31
-
32
- __global__ void __launch_bounds__(CR_SETUP_WARPS * 32, CR_SETUP_OPT_BLOCKS) triangleSetupKernel (const CR::CRParams p) { CR::triangleSetupImpl(p); }
33
- __global__ void __launch_bounds__(CR_BIN_WARPS * 32, 1) binRasterKernel (const CR::CRParams p) { CR::binRasterImpl(p); }
34
- __global__ void __launch_bounds__(CR_COARSE_WARPS * 32, 1) coarseRasterKernel (const CR::CRParams p) { CR::coarseRasterImpl(p); }
35
- __global__ void __launch_bounds__(CR_FINE_MAX_WARPS * 32, 1) fineRasterKernel (const CR::CRParams p) { CR::fineRasterImpl(p); }
36
-
37
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl DELETED
@@ -1,402 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- //------------------------------------------------------------------------
10
-
11
- __device__ __inline__ void snapTriangle(
12
- const CRParams& p,
13
- float4 v0, float4 v1, float4 v2,
14
- int2& p0, int2& p1, int2& p2, float3& rcpW, int2& lo, int2& hi)
15
- {
16
- F32 viewScaleX = (F32)(p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
17
- F32 viewScaleY = (F32)(p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
18
- rcpW = make_float3(1.0f / v0.w, 1.0f / v1.w, 1.0f / v2.w);
19
- p0 = make_int2(f32_to_s32_sat(v0.x * rcpW.x * viewScaleX), f32_to_s32_sat(v0.y * rcpW.x * viewScaleY));
20
- p1 = make_int2(f32_to_s32_sat(v1.x * rcpW.y * viewScaleX), f32_to_s32_sat(v1.y * rcpW.y * viewScaleY));
21
- p2 = make_int2(f32_to_s32_sat(v2.x * rcpW.z * viewScaleX), f32_to_s32_sat(v2.y * rcpW.z * viewScaleY));
22
- lo = make_int2(min_min(p0.x, p1.x, p2.x), min_min(p0.y, p1.y, p2.y));
23
- hi = make_int2(max_max(p0.x, p1.x, p2.x), max_max(p0.y, p1.y, p2.y));
24
- }
25
-
26
- //------------------------------------------------------------------------
27
-
28
- __device__ __inline__ U32 cover8x8_selectFlips(S32 dx, S32 dy) // 10 instr
29
- {
30
- U32 flips = 0;
31
- if (dy > 0 || (dy == 0 && dx <= 0))
32
- flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y) ^ (1 << CR_FLIPBIT_COMPL);
33
- if (dx > 0)
34
- flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y);
35
- if (::abs(dx) < ::abs(dy))
36
- flips ^= (1 << CR_FLIPBIT_SWAP_XY) ^ (1 << CR_FLIPBIT_FLIP_Y);
37
- return flips;
38
- }
39
-
40
- //------------------------------------------------------------------------
41
-
42
- __device__ __inline__ bool prepareTriangle(
43
- const CRParams& p,
44
- int2 p0, int2 p1, int2 p2, int2 lo, int2 hi,
45
- int2& d1, int2& d2, S32& area)
46
- {
47
- // Backfacing or degenerate => cull.
48
-
49
- d1 = make_int2(p1.x - p0.x, p1.y - p0.y);
50
- d2 = make_int2(p2.x - p0.x, p2.y - p0.y);
51
- area = d1.x * d2.y - d1.y * d2.x;
52
-
53
- if (area == 0)
54
- return false; // Degenerate.
55
-
56
- if (area < 0 && (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableBackfaceCulling) != 0)
57
- return false; // Backfacing.
58
-
59
- // AABB falls between samples => cull.
60
-
61
- int sampleSize = 1 << CR_SUBPIXEL_LOG2;
62
- int biasX = (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
63
- int biasY = (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
64
- int lox = (int)add_add(lo.x, sampleSize - 1, biasX) & -sampleSize;
65
- int loy = (int)add_add(lo.y, sampleSize - 1, biasY) & -sampleSize;
66
- int hix = (hi.x + biasX) & -sampleSize;
67
- int hiy = (hi.y + biasY) & -sampleSize;
68
-
69
- if (lox > hix || loy > hiy)
70
- return false; // Between pixels.
71
-
72
- // AABB covers 1 or 2 samples => cull if they are not covered.
73
-
74
- int diff = add_sub(hix, hiy, lox) - loy;
75
- if (diff <= sampleSize)
76
- {
77
- int2 t0 = make_int2(add_sub(p0.x, biasX, lox), add_sub(p0.y, biasY, loy));
78
- int2 t1 = make_int2(add_sub(p1.x, biasX, lox), add_sub(p1.y, biasY, loy));
79
- int2 t2 = make_int2(add_sub(p2.x, biasX, lox), add_sub(p2.y, biasY, loy));
80
- S32 e0 = t0.x * t1.y - t0.y * t1.x;
81
- S32 e1 = t1.x * t2.y - t1.y * t2.x;
82
- S32 e2 = t2.x * t0.y - t2.y * t0.x;
83
- if (area < 0)
84
- {
85
- e0 = -e0;
86
- e1 = -e1;
87
- e2 = -e2;
88
- }
89
-
90
- if (e0 < 0 || e1 < 0 || e2 < 0)
91
- {
92
- if (diff == 0)
93
- return false; // Between pixels.
94
-
95
- t0 = make_int2(add_sub(p0.x, biasX, hix), add_sub(p0.y, biasY, hiy));
96
- t1 = make_int2(add_sub(p1.x, biasX, hix), add_sub(p1.y, biasY, hiy));
97
- t2 = make_int2(add_sub(p2.x, biasX, hix), add_sub(p2.y, biasY, hiy));
98
- e0 = t0.x * t1.y - t0.y * t1.x;
99
- e1 = t1.x * t2.y - t1.y * t2.x;
100
- e2 = t2.x * t0.y - t2.y * t0.x;
101
- if (area < 0)
102
- {
103
- e0 = -e0;
104
- e1 = -e1;
105
- e2 = -e2;
106
- }
107
-
108
- if (e0 < 0 || e1 < 0 || e2 < 0)
109
- return false; // Between pixels.
110
- }
111
- }
112
-
113
- // Otherwise => proceed to output the triangle.
114
-
115
- return true; // Visible.
116
- }
117
-
118
- //------------------------------------------------------------------------
119
-
120
- __device__ __inline__ void setupTriangle(
121
- const CRParams& p,
122
- CRTriangleHeader* th, CRTriangleData* td, int triId,
123
- float v0z, float v1z, float v2z,
124
- int2 p0, int2 p1, int2 p2, float3 rcpW,
125
- int2 d1, int2 d2, S32 area)
126
- {
127
- // Swap vertices 1 and 2 if area is negative. Only executed if backface culling is
128
- // disabled (if it is enabled, we never come here with area < 0).
129
-
130
- if (area < 0)
131
- {
132
- swap(d1, d2);
133
- swap(p1, p2);
134
- swap(v1z, v2z);
135
- swap(rcpW.y, rcpW.z);
136
- area = -area;
137
- }
138
-
139
- int2 wv0;
140
- wv0.x = p0.x + (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
141
- wv0.y = p0.y + (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
142
-
143
- // Setup depth plane equation.
144
-
145
- F32 zcoef = (F32)(CR_DEPTH_MAX - CR_DEPTH_MIN) * 0.5f;
146
- F32 zbias = (F32)(CR_DEPTH_MAX + CR_DEPTH_MIN) * 0.5f;
147
- float3 zvert = make_float3(
148
- (v0z * zcoef) * rcpW.x + zbias,
149
- (v1z * zcoef) * rcpW.y + zbias,
150
- (v2z * zcoef) * rcpW.z + zbias
151
- );
152
- int2 zv0 = make_int2(
153
- wv0.x - (1 << (CR_SUBPIXEL_LOG2 - 1)),
154
- wv0.y - (1 << (CR_SUBPIXEL_LOG2 - 1))
155
- );
156
- uint3 zpleq = setupPleq(zvert, zv0, d1, d2, 1.0f / (F32)area);
157
-
158
- U32 zmin = f32_to_u32_sat(fminf(fminf(zvert.x, zvert.y), zvert.z) - (F32)CR_LERP_ERROR(0));
159
-
160
- // Write CRTriangleData.
161
-
162
- *(uint4*)td = make_uint4(zpleq.x, zpleq.y, zpleq.z, triId);
163
-
164
- // Determine flipbits.
165
-
166
- U32 f01 = cover8x8_selectFlips(d1.x, d1.y);
167
- U32 f12 = cover8x8_selectFlips(d2.x - d1.x, d2.y - d1.y);
168
- U32 f20 = cover8x8_selectFlips(-d2.x, -d2.y);
169
-
170
- // Write CRTriangleHeader.
171
-
172
- *(uint4*)th = make_uint4(
173
- prmt(p0.x, p0.y, 0x5410),
174
- prmt(p1.x, p1.y, 0x5410),
175
- prmt(p2.x, p2.y, 0x5410),
176
- (zmin & 0xfffff000u) | (f01 << 6) | (f12 << 2) | (f20 >> 2));
177
- }
178
-
179
- //------------------------------------------------------------------------
180
-
181
- __device__ __inline__ void triangleSetupImpl(const CRParams p)
182
- {
183
- __shared__ F32 s_bary[CR_SETUP_WARPS * 32][18];
184
- F32* bary = s_bary[threadIdx.x + threadIdx.y * 32];
185
-
186
- // Compute task and image indices.
187
-
188
- int taskIdx = threadIdx.x + 32 * (threadIdx.y + CR_SETUP_WARPS * blockIdx.x);
189
- int imageIdx = 0;
190
- if (p.instanceMode)
191
- {
192
- imageIdx = blockIdx.z;
193
- if (taskIdx >= p.numTriangles)
194
- return;
195
- }
196
- else
197
- {
198
- while (imageIdx < p.numImages)
199
- {
200
- int count = getImageParams(p, imageIdx).triCount;
201
- if (taskIdx < count)
202
- break;
203
- taskIdx -= count;
204
- imageIdx += 1;
205
- }
206
- if (imageIdx == p.numImages)
207
- return;
208
- }
209
-
210
- // Per-image data structures.
211
-
212
- const CRImageParams& ip = getImageParams(p, imageIdx);
213
- CRAtomics& atomics = p.atomics[imageIdx];
214
-
215
- const int* indexBuffer = (const int*)p.indexBuffer;
216
- U8* triSubtris = (U8*)p.triSubtris + imageIdx * p.maxSubtris;
217
- CRTriangleHeader* triHeader = (CRTriangleHeader*)p.triHeader + imageIdx * p.maxSubtris;
218
- CRTriangleData* triData = (CRTriangleData*)p.triData + imageIdx * p.maxSubtris;
219
-
220
- // Determine triangle index.
221
-
222
- int triIdx = taskIdx;
223
- if (!p.instanceMode)
224
- triIdx += ip.triOffset;
225
-
226
- // Read vertex indices.
227
-
228
- if ((U32)triIdx >= (U32)p.numTriangles)
229
- {
230
- // Bad triangle index.
231
- triSubtris[taskIdx] = 0;
232
- return;
233
- }
234
-
235
- uint4 vidx;
236
- vidx.x = indexBuffer[triIdx * 3 + 0];
237
- vidx.y = indexBuffer[triIdx * 3 + 1];
238
- vidx.z = indexBuffer[triIdx * 3 + 2];
239
- vidx.w = triIdx + 1; // Triangle index.
240
-
241
- if (vidx.x >= (U32)p.numVertices ||
242
- vidx.y >= (U32)p.numVertices ||
243
- vidx.z >= (U32)p.numVertices)
244
- {
245
- // Bad vertex index.
246
- triSubtris[taskIdx] = 0;
247
- return;
248
- }
249
-
250
- // Read vertex positions.
251
-
252
- const float4* vertexBuffer = (const float4*)p.vertexBuffer;
253
- if (p.instanceMode)
254
- vertexBuffer += p.numVertices * imageIdx; // Instance offset.
255
-
256
- float4 v0 = vertexBuffer[vidx.x];
257
- float4 v1 = vertexBuffer[vidx.y];
258
- float4 v2 = vertexBuffer[vidx.z];
259
-
260
- // Adjust vertex positions according to current viewport size and offset.
261
-
262
- v0.x = v0.x * p.xs + v0.w * p.xo;
263
- v0.y = v0.y * p.ys + v0.w * p.yo;
264
- v1.x = v1.x * p.xs + v1.w * p.xo;
265
- v1.y = v1.y * p.ys + v1.w * p.yo;
266
- v2.x = v2.x * p.xs + v2.w * p.xo;
267
- v2.y = v2.y * p.ys + v2.w * p.yo;
268
-
269
- // Outside view frustum => cull.
270
-
271
- if (v0.w < fabsf(v0.x) | v0.w < fabsf(v0.y) | v0.w < fabsf(v0.z))
272
- {
273
- if ((v0.w < +v0.x & v1.w < +v1.x & v2.w < +v2.x) |
274
- (v0.w < -v0.x & v1.w < -v1.x & v2.w < -v2.x) |
275
- (v0.w < +v0.y & v1.w < +v1.y & v2.w < +v2.y) |
276
- (v0.w < -v0.y & v1.w < -v1.y & v2.w < -v2.y) |
277
- (v0.w < +v0.z & v1.w < +v1.z & v2.w < +v2.z) |
278
- (v0.w < -v0.z & v1.w < -v1.z & v2.w < -v2.z))
279
- {
280
- triSubtris[taskIdx] = 0;
281
- return;
282
- }
283
- }
284
-
285
- // Inside depth range => try to snap vertices.
286
-
287
- if (v0.w >= fabsf(v0.z) & v1.w >= fabsf(v1.z) & v2.w >= fabsf(v2.z))
288
- {
289
- // Inside S16 range and small enough => fast path.
290
- // Note: aabbLimit comes from the fact that cover8x8
291
- // does not support guardband with maximal viewport.
292
-
293
- int2 p0, p1, p2, lo, hi;
294
- float3 rcpW;
295
-
296
- snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
297
- S32 loxy = ::min(lo.x, lo.y);
298
- S32 hixy = ::max(hi.x, hi.y);
299
- S32 aabbLimit = (1 << (CR_MAXVIEWPORT_LOG2 + CR_SUBPIXEL_LOG2)) - 1;
300
-
301
- if (loxy >= -32768 && hixy <= 32767 && hixy - loxy <= aabbLimit)
302
- {
303
- int2 d1, d2;
304
- S32 area;
305
- bool res = prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area);
306
- triSubtris[taskIdx] = res ? 1 : 0;
307
-
308
- if (res)
309
- setupTriangle(
310
- p,
311
- &triHeader[taskIdx], &triData[taskIdx], vidx.w,
312
- v0.z, v1.z, v2.z,
313
- p0, p1, p2, rcpW,
314
- d1, d2, area);
315
-
316
- return;
317
- }
318
- }
319
-
320
- // Clip to view frustum.
321
-
322
- float4 ov0 = v0;
323
- float4 od1 = make_float4(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z, v1.w - v0.w);
324
- float4 od2 = make_float4(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z, v2.w - v0.w);
325
- int numVerts = clipTriangleWithFrustum(bary, &ov0.x, &v1.x, &v2.x, &od1.x, &od2.x);
326
-
327
- // Count non-culled subtriangles.
328
-
329
- v0.x = ov0.x + od1.x * bary[0] + od2.x * bary[1];
330
- v0.y = ov0.y + od1.y * bary[0] + od2.y * bary[1];
331
- v0.z = ov0.z + od1.z * bary[0] + od2.z * bary[1];
332
- v0.w = ov0.w + od1.w * bary[0] + od2.w * bary[1];
333
- v1.x = ov0.x + od1.x * bary[2] + od2.x * bary[3];
334
- v1.y = ov0.y + od1.y * bary[2] + od2.y * bary[3];
335
- v1.z = ov0.z + od1.z * bary[2] + od2.z * bary[3];
336
- v1.w = ov0.w + od1.w * bary[2] + od2.w * bary[3];
337
- float4 tv1 = v1;
338
-
339
- int numSubtris = 0;
340
- for (int i = 2; i < numVerts; i++)
341
- {
342
- v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
343
- v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
344
- v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
345
- v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
346
-
347
- int2 p0, p1, p2, lo, hi, d1, d2;
348
- float3 rcpW;
349
- S32 area;
350
-
351
- snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
352
- if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
353
- numSubtris++;
354
-
355
- v1 = v2;
356
- }
357
-
358
- triSubtris[taskIdx] = numSubtris;
359
-
360
- // Multiple subtriangles => allocate.
361
-
362
- int subtriBase = taskIdx;
363
- if (numSubtris > 1)
364
- {
365
- subtriBase = atomicAdd(&atomics.numSubtris, numSubtris);
366
- triHeader[taskIdx].misc = subtriBase;
367
- if (subtriBase + numSubtris > p.maxSubtris)
368
- numVerts = 0;
369
- }
370
-
371
- // Setup subtriangles.
372
-
373
- v1 = tv1;
374
- for (int i = 2; i < numVerts; i++)
375
- {
376
- v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
377
- v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
378
- v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
379
- v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
380
-
381
- int2 p0, p1, p2, lo, hi, d1, d2;
382
- float3 rcpW;
383
- S32 area;
384
-
385
- snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
386
- if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
387
- {
388
- setupTriangle(
389
- p,
390
- &triHeader[subtriBase], &triData[subtriBase], vidx.w,
391
- v0.z, v1.z, v2.z,
392
- p0, p1, p2, rcpW,
393
- d1, d2, area);
394
-
395
- subtriBase++;
396
- }
397
-
398
- v1 = v2;
399
- }
400
- }
401
-
402
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl DELETED
@@ -1,452 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #include "PrivateDefs.hpp"
10
-
11
- namespace CR
12
- {
13
- //------------------------------------------------------------------------
14
-
15
- template<class T> __device__ __inline__ void swap(T& a, T& b) { T t = a; a = b; b = t; }
16
-
17
- __device__ __inline__ U32 getLo (U64 a) { return __double2loint(__longlong_as_double(a)); }
18
- __device__ __inline__ S32 getLo (S64 a) { return __double2loint(__longlong_as_double(a)); }
19
- __device__ __inline__ U32 getHi (U64 a) { return __double2hiint(__longlong_as_double(a)); }
20
- __device__ __inline__ S32 getHi (S64 a) { return __double2hiint(__longlong_as_double(a)); }
21
- __device__ __inline__ U64 combineLoHi (U32 lo, U32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); }
22
- __device__ __inline__ S64 combineLoHi (S32 lo, S32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); }
23
- __device__ __inline__ U32 getLaneMaskLt (void) { U32 r; asm("mov.u32 %0, %lanemask_lt;" : "=r"(r)); return r; }
24
- __device__ __inline__ U32 getLaneMaskLe (void) { U32 r; asm("mov.u32 %0, %lanemask_le;" : "=r"(r)); return r; }
25
- __device__ __inline__ U32 getLaneMaskGt (void) { U32 r; asm("mov.u32 %0, %lanemask_gt;" : "=r"(r)); return r; }
26
- __device__ __inline__ U32 getLaneMaskGe (void) { U32 r; asm("mov.u32 %0, %lanemask_ge;" : "=r"(r)); return r; }
27
- __device__ __inline__ int findLeadingOne (U32 v) { U32 r; asm("bfind.u32 %0, %1;" : "=r"(r) : "r"(v)); return r; }
28
- __device__ __inline__ bool singleLane (void) { return ((::__ballot_sync(~0u, true) & getLaneMaskLt()) == 0); }
29
-
30
- __device__ __inline__ void add_add_carry (U32& rlo, U32 alo, U32 blo, U32& rhi, U32 ahi, U32 bhi) { U64 r = combineLoHi(alo, ahi) + combineLoHi(blo, bhi); rlo = getLo(r); rhi = getHi(r); }
31
- __device__ __inline__ S32 f32_to_s32_sat (F32 a) { S32 v; asm("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
32
- __device__ __inline__ U32 f32_to_u32_sat (F32 a) { U32 v; asm("cvt.rni.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
33
- __device__ __inline__ U32 f32_to_u32_sat_rmi (F32 a) { U32 v; asm("cvt.rmi.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
34
- __device__ __inline__ U32 f32_to_u8_sat (F32 a) { U32 v; asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
35
- __device__ __inline__ S64 f32_to_s64 (F32 a) { S64 v; asm("cvt.rni.s64.f32 %0, %1;" : "=l"(v) : "f"(a)); return v; }
36
- __device__ __inline__ S32 add_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
37
- __device__ __inline__ S32 add_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
38
- __device__ __inline__ S32 add_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
39
- __device__ __inline__ S32 add_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
40
- __device__ __inline__ S32 sub_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
41
- __device__ __inline__ S32 sub_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
42
- __device__ __inline__ S32 sub_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
43
- __device__ __inline__ S32 sub_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
44
- __device__ __inline__ S32 sub_u16lo_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
45
- __device__ __inline__ S32 sub_u16hi_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
46
- __device__ __inline__ S32 sub_u16lo_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
47
- __device__ __inline__ S32 sub_u16hi_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
48
- __device__ __inline__ U32 add_b0 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b0, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
49
- __device__ __inline__ U32 add_b1 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
50
- __device__ __inline__ U32 add_b2 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b2, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
51
- __device__ __inline__ U32 add_b3 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b3, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
52
- __device__ __inline__ U32 vmad_b0 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
53
- __device__ __inline__ U32 vmad_b1 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
54
- __device__ __inline__ U32 vmad_b2 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
55
- __device__ __inline__ U32 vmad_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
56
- __device__ __inline__ U32 vmad_b0_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
57
- __device__ __inline__ U32 vmad_b1_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
58
- __device__ __inline__ U32 vmad_b2_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
59
- __device__ __inline__ U32 vmad_b3_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
60
- __device__ __inline__ U32 add_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vadd.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; }
61
- __device__ __inline__ U32 sub_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vsub.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; }
62
- __device__ __inline__ S32 max_max (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
63
- __device__ __inline__ S32 min_min (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
64
- __device__ __inline__ S32 max_add (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
65
- __device__ __inline__ S32 min_add (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
66
- __device__ __inline__ U32 add_add (U32 a, U32 b, U32 c) { U32 v; asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
67
- __device__ __inline__ U32 sub_add (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
68
- __device__ __inline__ U32 add_sub (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(c), "r"(b)); return v; }
69
- __device__ __inline__ S32 add_clamp_0_x (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
70
- __device__ __inline__ S32 add_clamp_b0 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
71
- __device__ __inline__ S32 add_clamp_b2 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b2, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
72
- __device__ __inline__ U32 prmt (U32 a, U32 b, U32 c) { U32 v; asm("prmt.b32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
73
- __device__ __inline__ S32 u32lo_sext (U32 a) { U32 v; asm("cvt.s16.u32 %0, %1;" : "=r"(v) : "r"(a)); return v; }
74
- __device__ __inline__ U32 slct (U32 a, U32 b, S32 c) { U32 v; asm("slct.u32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
75
- __device__ __inline__ S32 slct (S32 a, S32 b, S32 c) { S32 v; asm("slct.s32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
76
- __device__ __inline__ F32 slct (F32 a, F32 b, S32 c) { F32 v; asm("slct.f32.s32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "r"(c)); return v; }
77
- __device__ __inline__ U32 isetge (S32 a, S32 b) { U32 v; asm("set.ge.u32.s32 %0, %1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
78
- __device__ __inline__ F64 rcp_approx (F64 a) { F64 v; asm("rcp.approx.ftz.f64 %0, %1;" : "=d"(v) : "d"(a)); return v; }
79
- __device__ __inline__ F32 fma_rm (F32 a, F32 b, F32 c) { F32 v; asm("fma.rm.f32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "f"(c)); return v; }
80
- __device__ __inline__ U32 idiv_fast (U32 a, U32 b);
81
-
82
- __device__ __inline__ uint3 setupPleq (float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp);
83
-
84
- __device__ __inline__ void cover8x8_setupLUT (volatile U64* lut);
85
- __device__ __inline__ U64 cover8x8_exact_fast (S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut); // Assumes viewport <= 2^11, subpixels <= 2^4, no guardband.
86
- __device__ __inline__ U64 cover8x8_lookupMask (S64 yinit, U32 yinc, U32 flips, volatile const U64* lut);
87
-
88
- __device__ __inline__ U64 cover8x8_exact_noLUT (S32 ox, S32 oy, S32 dx, S32 dy); // optimized reference implementation, does not require look-up table
89
- __device__ __inline__ U64 cover8x8_conservative_noLUT (S32 ox, S32 oy, S32 dx, S32 dy);
90
- __device__ __inline__ U64 cover8x8_generateMask_noLUT (S32 curr, S32 dx, S32 dy);
91
-
92
- template <class T> __device__ __inline__ void sortShared(T* ptr, int numItems); // Assumes that numItems <= threadsInBlock. Must sync before & after the call.
93
-
94
- __device__ __inline__ const CRImageParams& getImageParams(const CRParams& p, int idx)
95
- {
96
- return (idx < CR_EMBED_IMAGE_PARAMS) ? p.imageParamsFirst[idx] : p.imageParamsExtra[idx - CR_EMBED_IMAGE_PARAMS];
97
- }
98
-
99
- //------------------------------------------------------------------------
100
-
101
- __device__ __inline__ int clipPolygonWithPlane(F32* baryOut, const F32* baryIn, int numIn, F32 v0, F32 v1, F32 v2)
102
- {
103
- int numOut = 0;
104
- if (numIn >= 3)
105
- {
106
- int ai = (numIn - 1) * 2;
107
- F32 av = v0 + v1 * baryIn[ai + 0] + v2 * baryIn[ai + 1];
108
- for (int bi = 0; bi < numIn * 2; bi += 2)
109
- {
110
- F32 bv = v0 + v1 * baryIn[bi + 0] + v2 * baryIn[bi + 1];
111
- if (av * bv < 0.0f)
112
- {
113
- F32 bc = av / (av - bv);
114
- F32 ac = 1.0f - bc;
115
- baryOut[numOut + 0] = baryIn[ai + 0] * ac + baryIn[bi + 0] * bc;
116
- baryOut[numOut + 1] = baryIn[ai + 1] * ac + baryIn[bi + 1] * bc;
117
- numOut += 2;
118
- }
119
- if (bv >= 0.0f)
120
- {
121
- baryOut[numOut + 0] = baryIn[bi + 0];
122
- baryOut[numOut + 1] = baryIn[bi + 1];
123
- numOut += 2;
124
- }
125
- ai = bi;
126
- av = bv;
127
- }
128
- }
129
- return (numOut >> 1);
130
- }
131
-
132
- //------------------------------------------------------------------------
133
-
134
- __device__ __inline__ int clipTriangleWithFrustum(F32* bary, const F32* v0, const F32* v1, const F32* v2, const F32* d1, const F32* d2)
135
- {
136
- int num = 3;
137
- bary[0] = 0.0f, bary[1] = 0.0f;
138
- bary[2] = 1.0f, bary[3] = 0.0f;
139
- bary[4] = 0.0f, bary[5] = 1.0f;
140
-
141
- if ((v0[3] < fabsf(v0[0])) | (v1[3] < fabsf(v1[0])) | (v2[3] < fabsf(v2[0])))
142
- {
143
- F32 temp[18];
144
- num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[0], d1[3] + d1[0], d2[3] + d2[0]);
145
- num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[0], d1[3] - d1[0], d2[3] - d2[0]);
146
- }
147
- if ((v0[3] < fabsf(v0[1])) | (v1[3] < fabsf(v1[1])) | (v2[3] < fabsf(v2[1])))
148
- {
149
- F32 temp[18];
150
- num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[1], d1[3] + d1[1], d2[3] + d2[1]);
151
- num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[1], d1[3] - d1[1], d2[3] - d2[1]);
152
- }
153
- if ((v0[3] < fabsf(v0[2])) | (v1[3] < fabsf(v1[2])) | (v2[3] < fabsf(v2[2])))
154
- {
155
- F32 temp[18];
156
- num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[2], d1[3] + d1[2], d2[3] + d2[2]);
157
- num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[2], d1[3] - d1[2], d2[3] - d2[2]);
158
- }
159
- return num;
160
- }
161
-
162
- //------------------------------------------------------------------------
163
-
164
- __device__ __inline__ U32 idiv_fast(U32 a, U32 b)
165
- {
166
- return f32_to_u32_sat_rmi(((F32)a + 0.5f) / (F32)b);
167
- }
168
-
169
- //------------------------------------------------------------------------
170
-
171
- __device__ __inline__ U32 toABGR(float4 color)
172
- {
173
- // 11 instructions: 4*FFMA, 4*F2I, 3*PRMT
174
- U32 x = f32_to_u32_sat_rmi(fma_rm(color.x, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
175
- U32 y = f32_to_u32_sat_rmi(fma_rm(color.y, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
176
- U32 z = f32_to_u32_sat_rmi(fma_rm(color.z, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
177
- U32 w = f32_to_u32_sat_rmi(fma_rm(color.w, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
178
- return prmt(prmt(x, y, 0x0073), prmt(z, w, 0x0073), 0x5410);
179
- }
180
-
181
- //------------------------------------------------------------------------
182
- // v0 = subpixels relative to the bottom-left sampling point
183
-
184
- __device__ __inline__ uint3 setupPleq(float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp)
185
- {
186
- F32 mx = fmaxf(fmaxf(values.x, values.y), values.z);
187
- int sh = ::min(::max((__float_as_int(mx) >> 23) - (127 + 22), 0), 8);
188
- S32 t0 = (U32)values.x >> sh;
189
- S32 t1 = ((U32)values.y >> sh) - t0;
190
- S32 t2 = ((U32)values.z >> sh) - t0;
191
-
192
- U32 rcpMant = (__float_as_int(areaRcp) & 0x007FFFFF) | 0x00800000;
193
- int rcpShift = (23 + 127) - (__float_as_int(areaRcp) >> 23);
194
-
195
- uint3 pleq;
196
- S64 xc = ((S64)t1 * d2.y - (S64)t2 * d1.y) * rcpMant;
197
- S64 yc = ((S64)t2 * d1.x - (S64)t1 * d2.x) * rcpMant;
198
- pleq.x = (U32)(xc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2)));
199
- pleq.y = (U32)(yc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2)));
200
-
201
- S32 centerX = (v0.x * 2 + min_min(d1.x, d2.x, 0) + max_max(d1.x, d2.x, 0)) >> (CR_SUBPIXEL_LOG2 + 1);
202
- S32 centerY = (v0.y * 2 + min_min(d1.y, d2.y, 0) + max_max(d1.y, d2.y, 0)) >> (CR_SUBPIXEL_LOG2 + 1);
203
- S32 vcx = v0.x - (centerX << CR_SUBPIXEL_LOG2);
204
- S32 vcy = v0.y - (centerY << CR_SUBPIXEL_LOG2);
205
-
206
- pleq.z = t0 << sh;
207
- pleq.z -= (U32)(((xc >> 13) * vcx + (yc >> 13) * vcy) >> (rcpShift - (sh + 13)));
208
- pleq.z -= pleq.x * centerX + pleq.y * centerY;
209
- return pleq;
210
- }
211
-
212
- //------------------------------------------------------------------------
213
-
214
- __device__ __inline__ void cover8x8_setupLUT(volatile U64* lut)
215
- {
216
- for (S32 lutIdx = threadIdx.x + blockDim.x * threadIdx.y; lutIdx < CR_COVER8X8_LUT_SIZE; lutIdx += blockDim.x * blockDim.y)
217
- {
218
- int half = (lutIdx < (12 << 5)) ? 0 : 1;
219
- int yint = (lutIdx >> 5) - half * 12 - 3;
220
- U32 shape = ((lutIdx >> 2) & 7) << (31 - 2);
221
- S32 slctSwapXY = lutIdx << (31 - 1);
222
- S32 slctNegX = lutIdx << (31 - 0);
223
- S32 slctCompl = slctSwapXY ^ slctNegX;
224
-
225
- U64 mask = 0;
226
- int xlo = half * 4;
227
- int xhi = xlo + 4;
228
- for (int x = xlo; x < xhi; x++)
229
- {
230
- int ylo = slct(0, ::max(yint, 0), slctCompl);
231
- int yhi = slct(::min(yint, 8), 8, slctCompl);
232
- for (int y = ylo; y < yhi; y++)
233
- {
234
- int xx = slct(x, y, slctSwapXY);
235
- int yy = slct(y, x, slctSwapXY);
236
- xx = slct(xx, 7 - xx, slctNegX);
237
- mask |= (U64)1 << (xx + yy * 8);
238
- }
239
- yint += shape >> 31;
240
- shape <<= 1;
241
- }
242
- lut[lutIdx] = mask;
243
- }
244
- }
245
-
246
- //------------------------------------------------------------------------
247
-
248
- __device__ __inline__ U64 cover8x8_exact_fast(S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut) // 52 instr
249
- {
250
- F32 yinitBias = (F32)(1 << (31 - CR_MAXVIEWPORT_LOG2 - CR_SUBPIXEL_LOG2 * 2));
251
- F32 yinitScale = (F32)(1 << (32 - CR_SUBPIXEL_LOG2));
252
- F32 yincScale = 65536.0f * 65536.0f;
253
-
254
- S32 slctFlipY = flips << (31 - CR_FLIPBIT_FLIP_Y);
255
- S32 slctFlipX = flips << (31 - CR_FLIPBIT_FLIP_X);
256
- S32 slctSwapXY = flips << (31 - CR_FLIPBIT_SWAP_XY);
257
-
258
- // Evaluate cross product.
259
-
260
- S32 t = ox * dy - oy * dx;
261
- F32 det = (F32)slct(t, t - dy * (7 << CR_SUBPIXEL_LOG2), slctFlipX);
262
- if (flips >= (1 << CR_FLIPBIT_COMPL))
263
- det = -det;
264
-
265
- // Represent Y as a function of X.
266
-
267
- F32 xrcp = 1.0f / (F32)::abs(slct(dx, dy, slctSwapXY));
268
- F32 yzero = det * yinitScale * xrcp + yinitBias;
269
- S64 yinit = f32_to_s64(slct(yzero, -yzero, slctFlipY));
270
- U32 yinc = f32_to_u32_sat((F32)::abs(slct(dy, dx, slctSwapXY)) * xrcp * yincScale);
271
-
272
- // Lookup.
273
-
274
- return cover8x8_lookupMask(yinit, yinc, flips, lut);
275
- }
276
-
277
- //------------------------------------------------------------------------
278
-
279
- __device__ __inline__ U64 cover8x8_lookupMask(S64 yinit, U32 yinc, U32 flips, volatile const U64* lut)
280
- {
281
- // First half.
282
-
283
- U32 yfrac = getLo(yinit);
284
- U32 shape = add_clamp_0_x(getHi(yinit) + 4, 0, 11);
285
- add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
286
- add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
287
- add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
288
- int oct = flips & ((1 << CR_FLIPBIT_FLIP_X) | (1 << CR_FLIPBIT_SWAP_XY));
289
- U64 mask = *(U64*)((U8*)lut + oct + (shape << 5));
290
-
291
- // Second half.
292
-
293
- add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
294
- shape = add_clamp_0_x(getHi(yinit) + 4, __popc(shape & 15), 11);
295
- add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
296
- add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
297
- add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
298
- mask |= *(U64*)((U8*)lut + oct + (shape << 5) + (12 << 8));
299
- return (flips >= (1 << CR_FLIPBIT_COMPL)) ? ~mask : mask;
300
- }
301
-
302
- //------------------------------------------------------------------------
303
-
304
- __device__ __inline__ U64 cover8x8_exact_noLUT(S32 ox, S32 oy, S32 dx, S32 dy)
305
- {
306
- S32 curr = ox * dy - oy * dx;
307
- if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive
308
- return cover8x8_generateMask_noLUT(curr, dx, dy);
309
- }
310
-
311
- //------------------------------------------------------------------------
312
-
313
- __device__ __inline__ U64 cover8x8_conservative_noLUT(S32 ox, S32 oy, S32 dx, S32 dy)
314
- {
315
- S32 curr = ox * dy - oy * dx;
316
- if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive
317
- curr += (::abs(dx) + ::abs(dy)) << (CR_SUBPIXEL_LOG2 - 1);
318
- return cover8x8_generateMask_noLUT(curr, dx, dy);
319
- }
320
-
321
- //------------------------------------------------------------------------
322
-
323
- __device__ __inline__ U64 cover8x8_generateMask_noLUT(S32 curr, S32 dx, S32 dy)
324
- {
325
- curr += (dx - dy) * (7 << CR_SUBPIXEL_LOG2);
326
- S32 stepX = dy << (CR_SUBPIXEL_LOG2 + 1);
327
- S32 stepYorig = -dx - dy * 7;
328
- S32 stepY = stepYorig << (CR_SUBPIXEL_LOG2 + 1);
329
-
330
- U32 hi = isetge(curr, 0);
331
- U32 frac = curr + curr;
332
- for (int i = 62; i >= 32; i--)
333
- add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, hi, hi, hi);
334
-
335
- U32 lo = 0;
336
- for (int i = 31; i >= 0; i--)
337
- add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, lo, lo, lo);
338
-
339
- lo ^= lo >> 1, hi ^= hi >> 1;
340
- lo ^= lo >> 2, hi ^= hi >> 2;
341
- lo ^= lo >> 4, hi ^= hi >> 4;
342
- lo ^= lo >> 8, hi ^= hi >> 8;
343
- lo ^= lo >> 16, hi ^= hi >> 16;
344
-
345
- if (dy < 0)
346
- {
347
- lo ^= 0x55AA55AA;
348
- hi ^= 0x55AA55AA;
349
- }
350
- if (stepYorig < 0)
351
- {
352
- lo ^= 0xFF00FF00;
353
- hi ^= 0x00FF00FF;
354
- }
355
- if ((hi & 1) != 0)
356
- lo = ~lo;
357
-
358
- return combineLoHi(lo, hi);
359
- }
360
-
361
- //------------------------------------------------------------------------
362
-
363
- template <class T> __device__ __inline__ void sortShared(T* ptr, int numItems)
364
- {
365
- int thrInBlock = threadIdx.x + threadIdx.y * blockDim.x;
366
- int range = 16;
367
-
368
- // Use transposition sort within each 16-wide subrange.
369
-
370
- int base = thrInBlock * 2;
371
- bool act = (base < numItems - 1);
372
- U32 actMask = __ballot_sync(~0u, act);
373
- if (act)
374
- {
375
- bool tryOdd = (base < numItems - 2 && (~base & (range - 2)) != 0);
376
- T mid = ptr[base + 1];
377
-
378
- for (int iter = 0; iter < range; iter += 2)
379
- {
380
- // Evens.
381
-
382
- T tmp = ptr[base + 0];
383
- if (tmp > mid)
384
- {
385
- ptr[base + 0] = mid;
386
- mid = tmp;
387
- }
388
- __syncwarp(actMask);
389
-
390
- // Odds.
391
-
392
- if (tryOdd)
393
- {
394
- tmp = ptr[base + 2];
395
- if (mid > tmp)
396
- {
397
- ptr[base + 2] = mid;
398
- mid = tmp;
399
- }
400
- }
401
- __syncwarp(actMask);
402
- }
403
- ptr[base + 1] = mid;
404
- }
405
-
406
- // Multiple subranges => Merge hierarchically.
407
-
408
- for (; range < numItems; range <<= 1)
409
- {
410
- // Assuming that we would insert the current item into the other
411
- // subrange, use binary search to find the appropriate slot.
412
-
413
- __syncthreads();
414
-
415
- T item;
416
- int slot;
417
- if (thrInBlock < numItems)
418
- {
419
- item = ptr[thrInBlock];
420
- slot = (thrInBlock & -range) ^ range;
421
- if (slot < numItems)
422
- {
423
- T tmp = ptr[slot];
424
- bool inclusive = ((thrInBlock & range) != 0);
425
- if (tmp < item || (inclusive && tmp == item))
426
- {
427
- for (int step = (range >> 1); step != 0; step >>= 1)
428
- {
429
- int probe = slot + step;
430
- if (probe < numItems)
431
- {
432
- tmp = ptr[probe];
433
- if (tmp < item || (inclusive && tmp == item))
434
- slot = probe;
435
- }
436
- }
437
- slot++;
438
- }
439
- }
440
- }
441
-
442
- // Store the item at an appropriate place.
443
-
444
- __syncthreads();
445
-
446
- if (thrInBlock < numItems)
447
- ptr[slot + (thrInBlock & (range * 2 - 1)) - range] = item;
448
- }
449
- }
450
-
451
- //------------------------------------------------------------------------
452
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/framework.h DELETED
@@ -1,49 +0,0 @@
1
- // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
-
11
- // Framework-specific macros to enable code sharing.
12
-
13
- //------------------------------------------------------------------------
14
- // Tensorflow.
15
-
16
- #ifdef NVDR_TENSORFLOW
17
- #define EIGEN_USE_GPU
18
- #include "tensorflow/core/framework/op.h"
19
- #include "tensorflow/core/framework/op_kernel.h"
20
- #include "tensorflow/core/framework/shape_inference.h"
21
- #include "tensorflow/core/platform/default/logging.h"
22
- using namespace tensorflow;
23
- using namespace tensorflow::shape_inference;
24
- #define NVDR_CTX_ARGS OpKernelContext* _nvdr_ctx
25
- #define NVDR_CTX_PARAMS _nvdr_ctx
26
- #define NVDR_CHECK(COND, ERR) OP_REQUIRES(_nvdr_ctx, COND, errors::Internal(ERR))
27
- #define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) OP_CHECK_CUDA_ERROR(_nvdr_ctx, CUDA_CALL)
28
- #define NVDR_CHECK_GL_ERROR(GL_CALL) OP_CHECK_GL_ERROR(_nvdr_ctx, GL_CALL)
29
- #endif
30
-
31
- //------------------------------------------------------------------------
32
- // PyTorch.
33
-
34
- #ifdef NVDR_TORCH
35
- #ifndef __CUDACC__
36
- #include <torch/extension.h>
37
- #include <ATen/cuda/CUDAContext.h>
38
- #include <ATen/cuda/CUDAUtils.h>
39
- #include <c10/cuda/CUDAGuard.h>
40
- #include <pybind11/numpy.h>
41
- #endif
42
- #define NVDR_CTX_ARGS int _nvdr_ctx_dummy
43
- #define NVDR_CTX_PARAMS 0
44
- #define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0)
45
- #define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(!err, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0)
46
- #define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0)
47
- #endif
48
-
49
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/glutil.cpp DELETED
@@ -1,403 +0,0 @@
1
- // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- //------------------------------------------------------------------------
10
- // Common.
11
- //------------------------------------------------------------------------
12
-
13
- #include "framework.h"
14
- #include "glutil.h"
15
- #include <iostream>
16
- #include <iomanip>
17
-
18
- // Create the function pointers.
19
- #define GLUTIL_EXT(return_type, name, ...) return_type (GLAPIENTRY* name)(__VA_ARGS__) = 0;
20
- #include "glutil_extlist.h"
21
- #undef GLUTIL_EXT
22
-
23
- // Track initialization status.
24
- static volatile bool s_glExtInitialized = false;
25
-
26
- // Error strings.
27
- const char* getGLErrorString(GLenum err)
28
- {
29
- switch(err)
30
- {
31
- case GL_NO_ERROR: return "GL_NO_ERROR";
32
- case GL_INVALID_ENUM: return "GL_INVALID_ENUM";
33
- case GL_INVALID_VALUE: return "GL_INVALID_VALUE";
34
- case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION";
35
- case GL_STACK_OVERFLOW: return "GL_STACK_OVERFLOW";
36
- case GL_STACK_UNDERFLOW: return "GL_STACK_UNDERFLOW";
37
- case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY";
38
- case GL_INVALID_FRAMEBUFFER_OPERATION: return "GL_INVALID_FRAMEBUFFER_OPERATION";
39
- case GL_TABLE_TOO_LARGE: return "GL_TABLE_TOO_LARGE";
40
- case GL_CONTEXT_LOST: return "GL_CONTEXT_LOST";
41
- }
42
- return "Unknown error";
43
- }
44
-
45
- //------------------------------------------------------------------------
46
- // Windows.
47
- //------------------------------------------------------------------------
48
-
49
- #ifdef _WIN32
50
-
51
- static CRITICAL_SECTION getInitializedCriticalSection(void)
52
- {
53
- CRITICAL_SECTION cs;
54
- InitializeCriticalSection(&cs);
55
- return cs;
56
- }
57
-
58
- static CRITICAL_SECTION s_getProcAddressMutex = getInitializedCriticalSection();
59
-
60
- static void safeGetProcAddress(const char* name, PROC* pfn)
61
- {
62
- PROC result = wglGetProcAddress(name);
63
- if (!result)
64
- {
65
- LeaveCriticalSection(&s_getProcAddressMutex); // Prepare for thread exit.
66
- LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'";
67
- exit(1); // Should never get here but make sure we exit.
68
- }
69
- *pfn = result;
70
- }
71
-
72
- static void initializeGLExtensions(void)
73
- {
74
- // Use critical section for thread safety.
75
- EnterCriticalSection(&s_getProcAddressMutex);
76
-
77
- // Only dig function pointers if not done already.
78
- if (!s_glExtInitialized)
79
- {
80
- // Generate code to populate the function pointers.
81
- #define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROC*)&name);
82
- #include "glutil_extlist.h"
83
- #undef GLUTIL_EXT
84
-
85
- // Mark as initialized.
86
- s_glExtInitialized = true;
87
- }
88
-
89
- // Done.
90
- LeaveCriticalSection(&s_getProcAddressMutex);
91
- return;
92
- }
93
-
94
- void setGLContext(GLContext& glctx)
95
- {
96
- if (!glctx.hglrc)
97
- LOG(FATAL) << "setGLContext() called with null gltcx";
98
- if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
99
- LOG(FATAL) << "wglMakeCurrent() failed when setting GL context";
100
-
101
- if (glctx.extInitialized)
102
- return;
103
- initializeGLExtensions();
104
- glctx.extInitialized = 1;
105
- }
106
-
107
- void releaseGLContext(void)
108
- {
109
- if (!wglMakeCurrent(NULL, NULL))
110
- LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context";
111
- }
112
-
113
- extern "C" int set_gpu(const char*); // In setgpu.lib
114
- GLContext createGLContext(int cudaDeviceIdx)
115
- {
116
- if (cudaDeviceIdx >= 0)
117
- {
118
- char pciBusId[256] = "";
119
- LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
120
- if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx))
121
- {
122
- LOG(INFO) << "PCI bus id query failed";
123
- }
124
- else
125
- {
126
- int res = set_gpu(pciBusId);
127
- LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? "failed, expect crash or major slowdown" : "success");
128
- }
129
- }
130
-
131
- HINSTANCE hInstance = GetModuleHandle(NULL);
132
- WNDCLASS wc = {};
133
- wc.style = CS_OWNDC;
134
- wc.lpfnWndProc = DefWindowProc;
135
- wc.hInstance = hInstance;
136
- wc.lpszClassName = "__DummyGLClassCPP";
137
- int res = RegisterClass(&wc);
138
-
139
- HWND hwnd = CreateWindow(
140
- "__DummyGLClassCPP", // lpClassName
141
- "__DummyGLWindowCPP", // lpWindowName
142
- WS_OVERLAPPEDWINDOW, // dwStyle
143
- CW_USEDEFAULT, // x
144
- CW_USEDEFAULT, // y
145
- 0, 0, // nWidth, nHeight
146
- NULL, NULL, // hWndParent, hMenu
147
- hInstance, // hInstance
148
- NULL // lpParam
149
- );
150
-
151
- PIXELFORMATDESCRIPTOR pfd = {};
152
- pfd.dwFlags = PFD_SUPPORT_OPENGL;
153
- pfd.iPixelType = PFD_TYPE_RGBA;
154
- pfd.iLayerType = PFD_MAIN_PLANE;
155
- pfd.cColorBits = 32;
156
- pfd.cDepthBits = 24;
157
- pfd.cStencilBits = 8;
158
-
159
- HDC hdc = GetDC(hwnd);
160
- int pixelformat = ChoosePixelFormat(hdc, &pfd);
161
- SetPixelFormat(hdc, pixelformat, &pfd);
162
-
163
- HGLRC hglrc = wglCreateContext(hdc);
164
- LOG(INFO) << std::hex << std::setfill('0')
165
- << "WGL OpenGL context created (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hdc
166
- << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hglrc << ")";
167
-
168
- GLContext glctx = {hdc, hglrc, 0};
169
- return glctx;
170
- }
171
-
172
- void destroyGLContext(GLContext& glctx)
173
- {
174
- if (!glctx.hglrc)
175
- LOG(FATAL) << "destroyGLContext() called with null gltcx";
176
-
177
- // If this is the current context, release it.
178
- if (wglGetCurrentContext() == glctx.hglrc)
179
- releaseGLContext();
180
-
181
- HWND hwnd = WindowFromDC(glctx.hdc);
182
- if (!hwnd)
183
- LOG(FATAL) << "WindowFromDC() failed";
184
- if (!ReleaseDC(hwnd, glctx.hdc))
185
- LOG(FATAL) << "ReleaseDC() failed";
186
- if (!wglDeleteContext(glctx.hglrc))
187
- LOG(FATAL) << "wglDeleteContext() failed";
188
- if (!DestroyWindow(hwnd))
189
- LOG(FATAL) << "DestroyWindow() failed";
190
-
191
- LOG(INFO) << std::hex << std::setfill('0')
192
- << "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
193
- << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hglrc << ")";
194
-
195
- memset(&glctx, 0, sizeof(GLContext));
196
- }
197
-
198
- #endif // _WIN32
199
-
200
- //------------------------------------------------------------------------
201
- // Linux.
202
- //------------------------------------------------------------------------
203
-
204
- #ifdef __linux__
205
-
206
- static pthread_mutex_t s_getProcAddressMutex;
207
-
208
- typedef void (*PROCFN)();
209
-
210
- static void safeGetProcAddress(const char* name, PROCFN* pfn)
211
- {
212
- PROCFN result = eglGetProcAddress(name);
213
- if (!result)
214
- {
215
- pthread_mutex_unlock(&s_getProcAddressMutex); // Prepare for thread exit.
216
- LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'";
217
- exit(1); // Should never get here but make sure we exit.
218
- }
219
- *pfn = result;
220
- }
221
-
222
- static void initializeGLExtensions(void)
223
- {
224
- pthread_mutex_lock(&s_getProcAddressMutex);
225
-
226
- // Only dig function pointers if not done already.
227
- if (!s_glExtInitialized)
228
- {
229
- // Generate code to populate the function pointers.
230
- #define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROCFN*)&name);
231
- #include "glutil_extlist.h"
232
- #undef GLUTIL_EXT
233
-
234
- // Mark as initialized.
235
- s_glExtInitialized = true;
236
- }
237
-
238
- pthread_mutex_unlock(&s_getProcAddressMutex);
239
- return;
240
- }
241
-
242
- void setGLContext(GLContext& glctx)
243
- {
244
- if (!glctx.context)
245
- LOG(FATAL) << "setGLContext() called with null gltcx";
246
-
247
- if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context))
248
- LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
249
-
250
- if (glctx.extInitialized)
251
- return;
252
- initializeGLExtensions();
253
- glctx.extInitialized = 1;
254
- }
255
-
256
- void releaseGLContext(void)
257
- {
258
- EGLDisplay display = eglGetCurrentDisplay();
259
- if (display == EGL_NO_DISPLAY)
260
- LOG(WARNING) << "releaseGLContext() called with no active display";
261
- if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
262
- LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context";
263
- }
264
-
265
- static EGLDisplay getCudaDisplay(int cudaDeviceIdx)
266
- {
267
- typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*);
268
- typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*);
269
- typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*);
270
-
271
- eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT");
272
- if (!eglQueryDevicesEXT)
273
- {
274
- LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed";
275
- return 0;
276
- }
277
-
278
- eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT");
279
- if (!eglQueryDeviceAttribEXT)
280
- {
281
- LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed";
282
- return 0;
283
- }
284
-
285
- eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT");
286
- if (!eglGetPlatformDisplayEXT)
287
- {
288
- LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed";
289
- return 0;
290
- }
291
-
292
- int num_devices = 0;
293
- eglQueryDevicesEXT(0, 0, &num_devices);
294
- if (!num_devices)
295
- return 0;
296
-
297
- EGLDisplay display = 0;
298
- EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*));
299
- eglQueryDevicesEXT(num_devices, devices, &num_devices);
300
- for (int i=0; i < num_devices; i++)
301
- {
302
- EGLDeviceEXT device = devices[i];
303
- intptr_t value = -1;
304
- if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx)
305
- {
306
- display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0);
307
- break;
308
- }
309
- }
310
-
311
- free(devices);
312
- return display;
313
- }
314
-
315
- GLContext createGLContext(int cudaDeviceIdx)
316
- {
317
- EGLDisplay display = 0;
318
-
319
- if (cudaDeviceIdx >= 0)
320
- {
321
- char pciBusId[256] = "";
322
- LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
323
- display = getCudaDisplay(cudaDeviceIdx);
324
- if (!display)
325
- LOG(INFO) << "Failed, falling back to default display";
326
- }
327
-
328
- if (!display)
329
- {
330
- display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
331
- if (display == EGL_NO_DISPLAY)
332
- LOG(FATAL) << "eglGetDisplay() failed";
333
- }
334
-
335
- EGLint major;
336
- EGLint minor;
337
- if (!eglInitialize(display, &major, &minor))
338
- LOG(FATAL) << "eglInitialize() failed";
339
-
340
- // Choose configuration.
341
-
342
- const EGLint context_attribs[] = {
343
- EGL_RED_SIZE, 8,
344
- EGL_GREEN_SIZE, 8,
345
- EGL_BLUE_SIZE, 8,
346
- EGL_ALPHA_SIZE, 8,
347
- EGL_DEPTH_SIZE, 24,
348
- EGL_STENCIL_SIZE, 8,
349
- EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT,
350
- EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
351
- EGL_NONE
352
- };
353
-
354
- EGLConfig config;
355
- EGLint num_config;
356
- if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
357
- LOG(FATAL) << "eglChooseConfig() failed";
358
-
359
- // Create GL context.
360
-
361
- if (!eglBindAPI(EGL_OPENGL_API))
362
- LOG(FATAL) << "eglBindAPI() failed";
363
-
364
- EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
365
- if (context == EGL_NO_CONTEXT)
366
- LOG(FATAL) << "eglCreateContext() failed";
367
-
368
- // Done.
369
-
370
- LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x"
371
- << std::hex << std::setfill('0')
372
- << std::setw(16) << (uintptr_t)display
373
- << ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";
374
-
375
- GLContext glctx = {display, context, 0};
376
- return glctx;
377
- }
378
-
379
- void destroyGLContext(GLContext& glctx)
380
- {
381
- if (!glctx.context)
382
- LOG(FATAL) << "destroyGLContext() called with null gltcx";
383
-
384
- // If this is the current context, release it.
385
- if (eglGetCurrentContext() == glctx.context)
386
- releaseGLContext();
387
-
388
- if (!eglDestroyContext(glctx.display, glctx.context))
389
- LOG(ERROR) << "eglDestroyContext() failed";
390
-
391
- LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
392
- << std::hex << std::setfill('0')
393
- << std::setw(16) << (uintptr_t)glctx.display
394
- << ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";
395
-
396
- memset(&glctx, 0, sizeof(GLContext));
397
- }
398
-
399
- //------------------------------------------------------------------------
400
-
401
- #endif // __linux__
402
-
403
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/glutil.h DELETED
@@ -1,113 +0,0 @@
1
- // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
-
11
- //------------------------------------------------------------------------
12
- // Windows-specific headers and types.
13
- //------------------------------------------------------------------------
14
-
15
- #ifdef _WIN32
16
- #define NOMINMAX
17
- #include <windows.h> // Required by gl.h in Windows.
18
- #define GLAPIENTRY APIENTRY
19
-
20
- struct GLContext
21
- {
22
- HDC hdc;
23
- HGLRC hglrc;
24
- int extInitialized;
25
- };
26
-
27
- #endif // _WIN32
28
-
29
- //------------------------------------------------------------------------
30
- // Linux-specific headers and types.
31
- //------------------------------------------------------------------------
32
-
33
- #ifdef __linux__
34
- #define EGL_NO_X11 // X11/Xlib.h has "#define Status int" which breaks Tensorflow. Avoid it.
35
- #define MESA_EGL_NO_X11_HEADERS
36
- #include <EGL/egl.h>
37
- #include <EGL/eglext.h>
38
- #define GLAPIENTRY
39
-
40
- struct GLContext
41
- {
42
- EGLDisplay display;
43
- EGLContext context;
44
- int extInitialized;
45
- };
46
-
47
- #endif // __linux__
48
-
49
- //------------------------------------------------------------------------
50
- // OpenGL, CUDA interop, GL extensions.
51
- //------------------------------------------------------------------------
52
- #define GL_GLEXT_LEGACY
53
- #include <GL/gl.h>
54
- #include <cuda_gl_interop.h>
55
-
56
- // Constants.
57
- #ifndef GL_VERSION_1_2
58
- #define GL_CLAMP_TO_EDGE 0x812F
59
- #define GL_TEXTURE_3D 0x806F
60
- #endif
61
- #ifndef GL_VERSION_1_5
62
- #define GL_ARRAY_BUFFER 0x8892
63
- #define GL_DYNAMIC_DRAW 0x88E8
64
- #define GL_ELEMENT_ARRAY_BUFFER 0x8893
65
- #endif
66
- #ifndef GL_VERSION_2_0
67
- #define GL_FRAGMENT_SHADER 0x8B30
68
- #define GL_INFO_LOG_LENGTH 0x8B84
69
- #define GL_LINK_STATUS 0x8B82
70
- #define GL_VERTEX_SHADER 0x8B31
71
- #endif
72
- #ifndef GL_VERSION_3_0
73
- #define GL_MAJOR_VERSION 0x821B
74
- #define GL_MINOR_VERSION 0x821C
75
- #define GL_RGBA32F 0x8814
76
- #define GL_TEXTURE_2D_ARRAY 0x8C1A
77
- #endif
78
- #ifndef GL_VERSION_3_2
79
- #define GL_GEOMETRY_SHADER 0x8DD9
80
- #endif
81
- #ifndef GL_ARB_framebuffer_object
82
- #define GL_COLOR_ATTACHMENT0 0x8CE0
83
- #define GL_COLOR_ATTACHMENT1 0x8CE1
84
- #define GL_DEPTH_STENCIL 0x84F9
85
- #define GL_DEPTH_STENCIL_ATTACHMENT 0x821A
86
- #define GL_DEPTH24_STENCIL8 0x88F0
87
- #define GL_FRAMEBUFFER 0x8D40
88
- #define GL_INVALID_FRAMEBUFFER_OPERATION 0x0506
89
- #define GL_UNSIGNED_INT_24_8 0x84FA
90
- #endif
91
- #ifndef GL_ARB_imaging
92
- #define GL_TABLE_TOO_LARGE 0x8031
93
- #endif
94
- #ifndef GL_KHR_robustness
95
- #define GL_CONTEXT_LOST 0x0507
96
- #endif
97
-
98
- // Declare function pointers to OpenGL extension functions.
99
- #define GLUTIL_EXT(return_type, name, ...) extern return_type (GLAPIENTRY* name)(__VA_ARGS__);
100
- #include "glutil_extlist.h"
101
- #undef GLUTIL_EXT
102
-
103
- //------------------------------------------------------------------------
104
- // Common functions.
105
- //------------------------------------------------------------------------
106
-
107
- void setGLContext (GLContext& glctx);
108
- void releaseGLContext (void);
109
- GLContext createGLContext (int cudaDeviceIdx);
110
- void destroyGLContext (GLContext& glctx);
111
- const char* getGLErrorString (GLenum err);
112
-
113
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extensions/nvdiffrast/nvdiffrast/common/glutil_extlist.h DELETED
@@ -1,48 +0,0 @@
1
- // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #ifndef GL_VERSION_1_2
10
- GLUTIL_EXT(void, glTexImage3D, GLenum target, GLint level, GLint internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
11
- #endif
12
- #ifndef GL_VERSION_1_5
13
- GLUTIL_EXT(void, glBindBuffer, GLenum target, GLuint buffer);
14
- GLUTIL_EXT(void, glBufferData, GLenum target, ptrdiff_t size, const void* data, GLenum usage);
15
- GLUTIL_EXT(void, glGenBuffers, GLsizei n, GLuint* buffers);
16
- #endif
17
- #ifndef GL_VERSION_2_0
18
- GLUTIL_EXT(void, glAttachShader, GLuint program, GLuint shader);
19
- GLUTIL_EXT(void, glCompileShader, GLuint shader);
20
- GLUTIL_EXT(GLuint, glCreateProgram, void);
21
- GLUTIL_EXT(GLuint, glCreateShader, GLenum type);
22
- GLUTIL_EXT(void, glDrawBuffers, GLsizei n, const GLenum* bufs);
23
- GLUTIL_EXT(void, glEnableVertexAttribArray, GLuint index);
24
- GLUTIL_EXT(void, glGetProgramInfoLog, GLuint program, GLsizei bufSize, GLsizei* length, char* infoLog);
25
- GLUTIL_EXT(void, glGetProgramiv, GLuint program, GLenum pname, GLint* param);
26
- GLUTIL_EXT(void, glLinkProgram, GLuint program);
27
- GLUTIL_EXT(void, glShaderSource, GLuint shader, GLsizei count, const char *const* string, const GLint* length);
28
- GLUTIL_EXT(void, glUniform1f, GLint location, GLfloat v0);
29
- GLUTIL_EXT(void, glUniform2f, GLint location, GLfloat v0, GLfloat v1);
30
- GLUTIL_EXT(void, glUseProgram, GLuint program);
31
- GLUTIL_EXT(void, glVertexAttribPointer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void* pointer);
32
- #endif
33
- #ifndef GL_VERSION_3_2
34
- GLUTIL_EXT(void, glFramebufferTexture, GLenum target, GLenum attachment, GLuint texture, GLint level);
35
- #endif
36
- #ifndef GL_ARB_framebuffer_object
37
- GLUTIL_EXT(void, glBindFramebuffer, GLenum target, GLuint framebuffer);
38
- GLUTIL_EXT(void, glGenFramebuffers, GLsizei n, GLuint* framebuffers);
39
- #endif
40
- #ifndef GL_ARB_vertex_array_object
41
- GLUTIL_EXT(void, glBindVertexArray, GLuint array);
42
- GLUTIL_EXT(void, glGenVertexArrays, GLsizei n, GLuint* arrays);
43
- #endif
44
- #ifndef GL_ARB_multi_draw_indirect
45
- GLUTIL_EXT(void, glMultiDrawElementsIndirect, GLenum mode, GLenum type, const void *indirect, GLsizei primcount, GLsizei stride);
46
- #endif
47
-
48
- //------------------------------------------------------------------------