Spaces:
Running on Zero
Running on Zero
Commit ·
e6de801
1
Parent(s): 52b092a
Migrate from TRELLIS v1 to TRELLIS.2 pipeline
Browse files
- Replace trellis/ with trellis2/ (3-stage: sparse structure -> shape -> texture)
- Model: microsoft/TRELLIS.2-4B (upgraded from TRELLIS-image-large)
- PyTorch 2.6.0 + CUDA 12.4, flash_attn_3, o_voxel, cumesh, flex_gemm
- GLB export via o_voxel.postprocess.to_glb with PBR materials
- Add HDRI envmaps for shaded rendering (forest, sunset, courtyard)
- Keep existing Gradio UI: LitModel3D, video preview, STL export
- Remove old extensions/, wheels/, Dockerfile (Gradio SDK space)
Co-authored-by: Cursor <cursoragent@cursor.com>
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .gitignore +9 -1
- app.py +261 -546
- wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl → assets/app/basecolor.png +2 -2
- assets/app/clay.png +3 -0
- assets/app/hdri_city.png +3 -0
- assets/app/hdri_courtyard.png +3 -0
- assets/app/hdri_forest.png +3 -0
- assets/app/hdri_interior.png +3 -0
- assets/app/hdri_night.png +3 -0
- assets/app/hdri_studio.png +3 -0
- assets/app/hdri_sunrise.png +3 -0
- assets/app/hdri_sunset.png +3 -0
- assets/app/normal.png +3 -0
- assets/hdri/city.exr +3 -0
- assets/hdri/courtyard.exr +3 -0
- assets/hdri/forest.exr +3 -0
- assets/hdri/interior.exr +3 -0
- assets/hdri/license.txt +15 -0
- assets/hdri/night.exr +3 -0
- assets/hdri/studio.exr +3 -0
- assets/hdri/sunrise.exr +3 -0
- assets/hdri/sunset.exr +3 -0
- autotune_cache.json +0 -0
- extensions/nvdiffrast/LICENSE.txt +0 -97
- extensions/nvdiffrast/README.md +0 -42
- extensions/nvdiffrast/nvdiffrast/__init__.py +0 -9
- extensions/nvdiffrast/nvdiffrast/common/antialias.cu +0 -558
- extensions/nvdiffrast/nvdiffrast/common/antialias.h +0 -50
- extensions/nvdiffrast/nvdiffrast/common/common.cpp +0 -60
- extensions/nvdiffrast/nvdiffrast/common/common.h +0 -263
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp +0 -63
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl +0 -423
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp +0 -94
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp +0 -55
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl +0 -730
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp +0 -73
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp +0 -79
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp +0 -90
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl +0 -385
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp +0 -153
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp +0 -370
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp +0 -102
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl_.cu +0 -37
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl +0 -402
- extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl +0 -452
- extensions/nvdiffrast/nvdiffrast/common/framework.h +0 -49
- extensions/nvdiffrast/nvdiffrast/common/glutil.cpp +0 -403
- extensions/nvdiffrast/nvdiffrast/common/glutil.h +0 -113
- extensions/nvdiffrast/nvdiffrast/common/glutil_extlist.h +0 -48
.gitattributes
CHANGED
|
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
*.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
*.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
*.exr filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -1,2 +1,10 @@
|
|
| 1 |
model_cache/
|
| 2 |
-
AGENTS.md
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
model_cache/
|
| 2 |
+
AGENTS.md
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
cache/
|
| 6 |
+
tmp/
|
| 7 |
+
trellis/
|
| 8 |
+
extensions/
|
| 9 |
+
wheels/
|
| 10 |
+
TRELLIS.2/
|
app.py
CHANGED
|
@@ -1,502 +1,293 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
print(f"[DIAG] app.py starting, __name__={__name__}, argv={sys.argv}", flush=True)
|
| 4 |
-
|
| 5 |
import argparse
|
|
|
|
|
|
|
| 6 |
import time
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
import gradio as gr
|
| 9 |
-
print(f"[DIAG] gradio imported, NO_RELOAD={gr.NO_RELOAD}", flush=True)
|
| 10 |
import spaces
|
| 11 |
from gradio_litmodel3d import LitModel3D
|
| 12 |
-
print("[DIAG] all top-level UI imports done", flush=True)
|
| 13 |
sys.path.append(os.getcwd())
|
| 14 |
-
|
| 15 |
-
os.environ['SPCONV_ALGO'] = 'native'
|
| 16 |
-
os.environ['TORCH_CUDA_ARCH_LIST'] = '8.9'
|
| 17 |
-
import concurrent.futures
|
| 18 |
from typing import *
|
| 19 |
-
print("[DIAG] importing torch...", flush=True)
|
| 20 |
import torch
|
| 21 |
-
print(f"[DIAG] torch imported, cuda.is_available={torch.cuda.is_available()}", flush=True)
|
| 22 |
import numpy as np
|
| 23 |
import imageio
|
| 24 |
from PIL import Image
|
| 25 |
import trimesh
|
| 26 |
-
|
| 27 |
from datetime import datetime
|
| 28 |
import logging
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 31 |
logger = logging.getLogger(__name__)
|
| 32 |
|
| 33 |
-
# --- Command Line Args ---
|
| 34 |
-
print("[DIAG] setting up argparse...", flush=True)
|
| 35 |
parser = argparse.ArgumentParser(description="Pocket 3D AI 2")
|
| 36 |
-
parser.add_argument("--prod",
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
parser.add_argument("--port",
|
| 40 |
-
type=int,
|
| 41 |
-
help="Port to run the server on (default: 8081 for prod, 8080 for dev)")
|
| 42 |
-
cmd_args, unknown_args = parser.parse_known_args()
|
| 43 |
-
if unknown_args:
|
| 44 |
-
print(f"[DIAG] WARNING: unknown args ignored: {unknown_args}", flush=True)
|
| 45 |
-
print(f"[DIAG] argparse done, prod={cmd_args.prod}, port={cmd_args.port}", flush=True)
|
| 46 |
|
| 47 |
prod = cmd_args.prod
|
| 48 |
port = cmd_args.port if cmd_args.port else (8081 if prod else 8080)
|
| 49 |
show_options = not prod
|
| 50 |
|
| 51 |
MAX_SEED = np.iinfo(np.int32).max
|
| 52 |
-
|
| 53 |
TMP_DIR = os.path.join('cache')
|
| 54 |
os.makedirs(TMP_DIR, exist_ok=True)
|
| 55 |
|
| 56 |
-
print(f"[DIAG] entering gr.NO_RELOAD block (NO_RELOAD={gr.NO_RELOAD})...", flush=True)
|
| 57 |
if gr.NO_RELOAD:
|
| 58 |
-
print("[DIAG] importing trellis pipeline...", flush=True)
|
| 59 |
-
from trellis.pipelines.trellis_image_to_3d import TrellisImageTo3DPipeline
|
| 60 |
-
print("[DIAG] importing trellis utils...", flush=True)
|
| 61 |
-
from trellis.utils import render_utils, postprocessing_utils
|
| 62 |
-
print("[DIAG] trellis imports done", flush=True)
|
| 63 |
pipeline = None
|
|
|
|
| 64 |
|
| 65 |
def initialize_pipeline():
|
| 66 |
-
global pipeline
|
| 67 |
if pipeline is not None:
|
| 68 |
-
logger.info("Pipeline already initialized.")
|
| 69 |
return
|
| 70 |
|
| 71 |
-
logger.info("Initializing pipeline...")
|
| 72 |
start_time = time.time()
|
| 73 |
-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 74 |
-
target_dtype = torch.float16 if device.type == 'cuda' else torch.float32
|
| 75 |
-
logger.info(f"Target device: {device}, Target dtype: {target_dtype}")
|
| 76 |
|
| 77 |
try:
|
| 78 |
-
|
| 79 |
-
pipeline =
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
if rmbg_device.type != 'cpu':
|
| 94 |
-
logger.info(f"Moving RMBG model from {rmbg_device} to CPU...")
|
| 95 |
-
pipeline.rmbg_model.to('cpu')
|
| 96 |
-
|
| 97 |
-
logger.info(f"⏰ Pipeline initialized and confirmed on CPU in {time.time() - start_time:.2f} seconds.")
|
| 98 |
-
|
| 99 |
except Exception as e:
|
| 100 |
logger.error(f"Failed to initialize pipeline: {e}", exc_info=True)
|
| 101 |
-
pipeline = None
|
| 102 |
-
raise
|
| 103 |
-
|
| 104 |
|
| 105 |
initialize_pipeline()
|
| 106 |
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
def start_session(req: gr.Request):
|
| 110 |
-
# user_dir = os.path.join(TMP_DIR, str(req.session_hash)) # Per-session dir if needed
|
| 111 |
torch.cuda.empty_cache()
|
| 112 |
-
|
| 113 |
-
|
| 114 |
|
| 115 |
def end_session(req: gr.Request):
|
| 116 |
torch.cuda.empty_cache()
|
| 117 |
|
|
|
|
| 118 |
def preprocess_image(image: Optional[Image.Image]) -> Optional[Image.Image]:
|
| 119 |
if image is None:
|
| 120 |
-
logger.warning("Preprocess: received None image.")
|
| 121 |
return None
|
| 122 |
-
user_dir = TMP_DIR
|
| 123 |
-
current_time = datetime.now().strftime("%Y-%m%d-%H%M%S")
|
| 124 |
-
image_path = os.path.join(user_dir, f'{current_time}.png')
|
| 125 |
-
image.save(image_path)
|
| 126 |
try:
|
| 127 |
-
|
| 128 |
-
return processed_image
|
| 129 |
except Exception as e:
|
| 130 |
logger.error(f"Error during image preprocessing: {e}", exc_info=True)
|
| 131 |
-
return None
|
| 132 |
-
|
| 133 |
-
def preprocess_images(images: List[Tuple[Image.Image, str]]) -> List[Image.Image]:
|
| 134 |
-
images = [image[0] for image in images]
|
| 135 |
-
processed_images = pipeline.preprocess_images(images)
|
| 136 |
-
return processed_images
|
| 137 |
|
| 138 |
def get_seed(randomize_seed: bool, seed: int) -> int:
|
| 139 |
-
if randomize_seed
|
| 140 |
-
new_seed = np.random.randint(0, MAX_SEED)
|
| 141 |
-
return new_seed
|
| 142 |
-
else:
|
| 143 |
-
return seed
|
| 144 |
|
| 145 |
-
# --- Core Logic Functions ---
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
|
|
|
| 149 |
seed: int,
|
|
|
|
| 150 |
ss_guidance_strength: float,
|
| 151 |
ss_sampling_steps: int,
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
progress=gr.Progress(track_tqdm=True)
|
| 155 |
-
) ->
|
| 156 |
if image is None or pipeline is None:
|
| 157 |
-
|
| 158 |
-
return None
|
| 159 |
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
|
| 163 |
-
outputs = pipeline.run(
|
| 164 |
-
image,
|
| 165 |
-
seed=seed,
|
| 166 |
-
formats=["gaussian", "mesh"],
|
| 167 |
-
preprocess_image=False,
|
| 168 |
-
sparse_structure_sampler_params={
|
| 169 |
-
"steps": ss_sampling_steps,
|
| 170 |
-
"cfg_strength": ss_guidance_strength,
|
| 171 |
-
},
|
| 172 |
-
slat_sampler_params={
|
| 173 |
-
"steps": slat_sampling_steps,
|
| 174 |
-
"cfg_strength": slat_guidance_strength,
|
| 175 |
-
},
|
| 176 |
-
)
|
| 177 |
-
logger.info(f"⌚ Pipeline Time: {time.time() - pipeline_start:.2f} seconds")
|
| 178 |
-
return outputs
|
| 179 |
-
except Exception as e:
|
| 180 |
-
logger.error(f"Error during pipeline run: {e}", exc_info=True)
|
| 181 |
-
torch.cuda.empty_cache()
|
| 182 |
-
return None
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
ss_sampling_steps: int,
|
| 189 |
-
slat_guidance_strength: float,
|
| 190 |
-
slat_sampling_steps: int,
|
| 191 |
-
multiimage_algo: Literal["multidiffusion", "stochastic"],
|
| 192 |
-
progress=gr.Progress(track_tqdm=True)
|
| 193 |
-
) -> Optional[dict]:
|
| 194 |
-
if not images or pipeline is None:
|
| 195 |
-
logger.error("Generate 3D Data Multi-Image: called with empty images or uninitialized pipeline.")
|
| 196 |
-
return None
|
| 197 |
|
|
|
|
| 198 |
pipeline_start = time.time()
|
| 199 |
|
| 200 |
try:
|
| 201 |
-
outputs = pipeline.
|
| 202 |
-
|
| 203 |
seed=seed,
|
| 204 |
-
formats=["gaussian", "mesh"],
|
| 205 |
preprocess_image=False,
|
| 206 |
sparse_structure_sampler_params={
|
| 207 |
"steps": ss_sampling_steps,
|
| 208 |
-
"
|
| 209 |
},
|
| 210 |
-
|
| 211 |
-
"steps":
|
| 212 |
-
"
|
| 213 |
},
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
)
|
| 216 |
-
logger.info(f"⌚ Multi-Image Pipeline Time: {time.time() - pipeline_start:.2f} seconds")
|
| 217 |
-
return outputs
|
| 218 |
except Exception as e:
|
| 219 |
-
logger.error(f"Error during
|
| 220 |
torch.cuda.empty_cache()
|
| 221 |
-
return
|
| 222 |
|
|
|
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
is_mobile: bool,
|
| 228 |
-
user_dir: str,
|
| 229 |
-
progress=gr.Progress(track_tqdm=True) # Keep progress for potential future use or consistency
|
| 230 |
-
) -> Optional[str]:
|
| 231 |
-
"""
|
| 232 |
-
Combines pre-rendered color and normal video frames and saves the result.
|
| 233 |
-
"""
|
| 234 |
-
if not video_color_frames or not video_normal_frames:
|
| 235 |
-
logger.error("Combine Video: received empty frame lists.")
|
| 236 |
-
return None
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
try:
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
combined_video = [np.concatenate([color_frame, normal_frame], axis=0) for color_frame, normal_frame in zip(video_color_frames, video_normal_frames)]
|
| 246 |
-
else:
|
| 247 |
-
# Horizontal video (side by side)
|
| 248 |
-
combined_video = [np.concatenate([color_frame, normal_frame], axis=1) for color_frame, normal_frame in zip(video_color_frames, video_normal_frames)]
|
| 249 |
-
|
| 250 |
-
# Save video
|
| 251 |
-
current_time = datetime.now().strftime("%Y-%m%d-%H%M%S")
|
| 252 |
-
video_path = os.path.join(user_dir, f'{current_time}.mp4')
|
| 253 |
-
# Use a thread for saving to avoid blocking if I/O is slow
|
| 254 |
-
# Note: imageio.mimsave might release GIL for some codecs/operations
|
| 255 |
-
imageio.mimsave(video_path, combined_video, fps=15)
|
| 256 |
-
|
| 257 |
-
return video_path
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
except Exception as e:
|
| 260 |
-
logger.error(f"
|
| 261 |
-
return None
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
def extract_glb(
|
| 265 |
-
outputs: dict,
|
| 266 |
-
mesh_simplify: float,
|
| 267 |
-
texture_size: int,
|
| 268 |
-
progress=gr.Progress(track_tqdm=True)
|
| 269 |
-
) -> Optional[Tuple[str, str]]: # MODIFIED return type
|
| 270 |
-
"""
|
| 271 |
-
Extract a GLB file from the 3D model outputs and convert to STL.
|
| 272 |
-
(Modified to return GLB and STL paths)
|
| 273 |
-
"""
|
| 274 |
-
if outputs is None or 'gaussian' not in outputs or 'mesh' not in outputs:
|
| 275 |
-
logger.error("Extract GLB: received invalid outputs.")
|
| 276 |
-
return None, None # MODIFIED return
|
| 277 |
-
|
| 278 |
-
glb_start_time = time.time() # Renamed for clarity
|
| 279 |
-
user_dir = TMP_DIR
|
| 280 |
-
glb_path: Optional[str] = None
|
| 281 |
-
stl_path: Optional[str] = None
|
| 282 |
|
|
|
|
| 283 |
try:
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
texture_size=texture_size,
|
| 290 |
-
|
| 291 |
-
|
|
|
|
|
|
|
| 292 |
)
|
|
|
|
| 293 |
current_time_glb = datetime.now().strftime("%Y-%m%d-%H%M%S")
|
| 294 |
-
glb_path = os.path.join(
|
| 295 |
-
|
| 296 |
-
logger.info(f"
|
| 297 |
|
| 298 |
-
|
| 299 |
try:
|
| 300 |
mesh_data = trimesh.load_mesh(glb_path, force='mesh')
|
| 301 |
mesh_to_export = None
|
| 302 |
|
| 303 |
if isinstance(mesh_data, trimesh.Scene):
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
logger.warning(f"Loaded object from {glb_path} of type {type(mesh_data)} is not a Trimesh or Scene.")
|
| 327 |
-
|
| 328 |
-
if mesh_to_export:
|
| 329 |
-
if not (mesh_to_export.faces is not None and len(mesh_to_export.faces) > 0):
|
| 330 |
-
logger.warning(f"Mesh for STL export from {glb_path} has no faces. Attempting to convex hull.")
|
| 331 |
-
if mesh_to_export.vertices is not None and len(mesh_to_export.vertices) >= 4:
|
| 332 |
-
try:
|
| 333 |
-
mesh_to_export = mesh_to_export.convex_hull
|
| 334 |
-
except Exception as convex_e:
|
| 335 |
-
logger.error(f"Failed to create convex hull for {glb_path}: {convex_e}")
|
| 336 |
-
mesh_to_export = None
|
| 337 |
-
else:
|
| 338 |
-
mesh_to_export = None
|
| 339 |
-
|
| 340 |
-
if mesh_to_export and mesh_to_export.faces is not None and len(mesh_to_export.faces) > 0:
|
| 341 |
-
current_time_stl = datetime.now().strftime("%Y-%m%d-%H%M%S-%f")
|
| 342 |
-
stl_path = os.path.join(user_dir, f'{current_time_stl}.stl')
|
| 343 |
-
if mesh_to_export:
|
| 344 |
-
mesh_to_export = mesh_to_export.copy()
|
| 345 |
-
# Y-up to Z-up (upright, not upside down)
|
| 346 |
-
rot_x_90 = trimesh.transformations.rotation_matrix(np.deg2rad(90), [1, 0, 0])
|
| 347 |
-
mesh_to_export.apply_transform(rot_x_90)
|
| 348 |
-
bbox = mesh_to_export.bounds
|
| 349 |
-
current_size = (bbox[1] - bbox[0]).max()
|
| 350 |
-
target_size_mm = 152.4 # 6 inches
|
| 351 |
-
if current_size > 0:
|
| 352 |
-
scale_factor = target_size_mm / current_size
|
| 353 |
-
mesh_to_export.vertices *= scale_factor
|
| 354 |
-
mesh_to_export.export(stl_path)
|
| 355 |
-
logger.info(f"⌚ STL Export Time: {time.time() - stl_export_start_time:.2f} seconds. Saved to {stl_path}")
|
| 356 |
-
elif mesh_to_export:
|
| 357 |
-
logger.error(f"Failed to prepare mesh with faces from {glb_path} for STL export.")
|
| 358 |
-
stl_path = None
|
| 359 |
-
else:
|
| 360 |
-
logger.error(f"No valid mesh could be processed from {glb_path} for STL export.")
|
| 361 |
-
stl_path = None
|
| 362 |
-
|
| 363 |
except Exception as stl_e:
|
| 364 |
-
logger.error(f"
|
| 365 |
-
stl_path = None
|
| 366 |
-
|
| 367 |
-
return glb_path, stl_path
|
| 368 |
-
|
| 369 |
-
except Exception as e:
|
| 370 |
-
logger.error(f"Error during GLB/STL extraction: {e}", exc_info=True)
|
| 371 |
-
return None, None
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
@spaces.GPU(duration=120)
|
| 375 |
-
def process_image_concurrently_yielding(
|
| 376 |
-
image: Optional[Image.Image],
|
| 377 |
-
multiimages: List[Tuple[Image.Image, str]],
|
| 378 |
-
is_multiimage: bool,
|
| 379 |
-
seed: int,
|
| 380 |
-
ss_guidance_strength: float,
|
| 381 |
-
ss_sampling_steps: int,
|
| 382 |
-
slat_guidance_strength: float,
|
| 383 |
-
slat_sampling_steps: int,
|
| 384 |
-
mesh_simplify: float,
|
| 385 |
-
texture_size: int,
|
| 386 |
-
multiimage_algo: Literal["multidiffusion", "stochastic"],
|
| 387 |
-
req: gr.Request,
|
| 388 |
-
progress=gr.Progress(track_tqdm=True)
|
| 389 |
-
) -> Generator[Tuple[Optional[str], Optional[str], Dict[str, Any]], None, None]:
|
| 390 |
-
video_path: Optional[str] = None
|
| 391 |
-
glb_path: Optional[str] = None
|
| 392 |
-
stl_path: Optional[str] = None
|
| 393 |
-
color_frames: Optional[List[np.ndarray]] = None
|
| 394 |
-
normal_frames: Optional[List[np.ndarray]] = None
|
| 395 |
-
|
| 396 |
-
# Move all models to real GPU now that @spaces.GPU has allocated one
|
| 397 |
-
logger.info("Moving pipeline models to GPU...")
|
| 398 |
-
move_start = time.time()
|
| 399 |
-
pipeline.cuda()
|
| 400 |
-
if hasattr(pipeline, 'rmbg_model') and pipeline.rmbg_model is not None:
|
| 401 |
-
pipeline.rmbg_model.to('cuda')
|
| 402 |
-
logger.info(f"Models moved to GPU in {time.time() - move_start:.2f}s")
|
| 403 |
|
| 404 |
-
|
|
|
|
| 405 |
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
if image is None:
|
| 409 |
-
logger.error("Received None image.")
|
| 410 |
-
return
|
| 411 |
-
logger.info("Preprocessing single image inside GPU session...")
|
| 412 |
-
progress(0, desc="Removing background...")
|
| 413 |
-
image = preprocess_image(image)
|
| 414 |
-
if image is None:
|
| 415 |
-
logger.error("Image preprocessing failed.")
|
| 416 |
-
return
|
| 417 |
-
logger.info("Starting Pipeline (Single Image)...")
|
| 418 |
-
outputs = generate_3d_data(
|
| 419 |
-
image, seed, ss_guidance_strength, ss_sampling_steps,
|
| 420 |
-
slat_guidance_strength, slat_sampling_steps, progress=progress
|
| 421 |
-
)
|
| 422 |
-
else:
|
| 423 |
-
if not multiimages:
|
| 424 |
-
logger.error("Received empty multiimages list.")
|
| 425 |
-
return
|
| 426 |
-
logger.info(f"Preprocessing and starting Pipeline (Multi-Image: {len(multiimages)} images)...")
|
| 427 |
-
progress(0, desc="Preprocessing images...")
|
| 428 |
-
processed_images = preprocess_images(multiimages)
|
| 429 |
-
progress(0.1, desc="Generating 3D Structure...")
|
| 430 |
-
outputs = generate_3d_data_multi_image(
|
| 431 |
-
processed_images, seed, ss_guidance_strength, ss_sampling_steps,
|
| 432 |
-
slat_guidance_strength, slat_sampling_steps, multiimage_algo, progress=progress
|
| 433 |
-
)
|
| 434 |
-
|
| 435 |
-
if outputs is None:
|
| 436 |
-
logger.error("Failed to generate 3D data. Aborting.")
|
| 437 |
-
return
|
| 438 |
|
| 439 |
-
# --- Step 2: Determine Render Settings ---
|
| 440 |
-
user_agent = req.headers.get("User-Agent", "").lower()
|
| 441 |
-
is_mobile = any(device in user_agent for device in ["android", "iphone", "ipad", "mobile"])
|
| 442 |
-
resolution = 256 if is_mobile else 384
|
| 443 |
-
num_frames = 45 # Consistent frame count
|
| 444 |
-
|
| 445 |
-
# --- Step 3: Render Videos First ---
|
| 446 |
-
vid_time = time.time()
|
| 447 |
-
|
| 448 |
-
progress(0.4, desc="Rendering Preview Videos...")
|
| 449 |
-
logger.info("Rendering videos: color and normal")
|
| 450 |
-
|
| 451 |
-
try:
|
| 452 |
-
color_result = render_utils.render_video(outputs['gaussian'][0], resolution=resolution, num_frames=num_frames, mode='color', verbose=False)
|
| 453 |
-
normal_result = render_utils.render_video(outputs['mesh'][0], resolution=resolution, num_frames=num_frames, mode='normal', verbose=False)
|
| 454 |
-
|
| 455 |
-
if color_result and 'color' in color_result:
|
| 456 |
-
color_frames = color_result['color']
|
| 457 |
-
else:
|
| 458 |
-
logger.warning("Color video rendering returned invalid data.")
|
| 459 |
-
color_frames = []
|
| 460 |
-
|
| 461 |
-
if normal_result and 'normal' in normal_result:
|
| 462 |
-
normal_frames = normal_result['normal']
|
| 463 |
-
else:
|
| 464 |
-
logger.warning("Normal video rendering returned invalid data.")
|
| 465 |
-
normal_frames = []
|
| 466 |
-
|
| 467 |
-
except Exception as exc:
|
| 468 |
-
logger.error(f"Video rendering generated an exception: {exc}", exc_info=True)
|
| 469 |
-
color_frames = []
|
| 470 |
-
normal_frames = []
|
| 471 |
-
|
| 472 |
-
if color_frames and normal_frames:
|
| 473 |
-
video_path = combine_and_save_video(color_frames, normal_frames, is_mobile, TMP_DIR, progress=progress)
|
| 474 |
-
if video_path:
|
| 475 |
-
logger.info(f"✅ Video Time: {time.time() - vid_time:.2f} seconds")
|
| 476 |
-
yield video_path, None, gr.update(value=None, interactive=False)
|
| 477 |
-
else:
|
| 478 |
-
logger.warning("Video combination/saving failed.")
|
| 479 |
-
|
| 480 |
-
# --- Step 4: Extract GLB/STL After Video ---
|
| 481 |
-
try:
|
| 482 |
-
progress(0.7, desc="Finalizing 3D Model & Textures...")
|
| 483 |
-
glb_stl_result = extract_glb(outputs, mesh_simplify, texture_size, progress=progress)
|
| 484 |
-
if glb_stl_result and isinstance(glb_stl_result, tuple) and len(glb_stl_result) == 2:
|
| 485 |
-
glb_path, stl_path = glb_stl_result
|
| 486 |
-
if not glb_path:
|
| 487 |
-
logger.warning("GLB extraction returned None.")
|
| 488 |
-
if not stl_path:
|
| 489 |
-
logger.warning("STL extraction returned None.")
|
| 490 |
-
else:
|
| 491 |
-
logger.warning(f"GLB/STL extraction returned invalid data: {glb_stl_result}")
|
| 492 |
-
glb_path, stl_path = None, None
|
| 493 |
-
except Exception as exc:
|
| 494 |
-
logger.error(f"GLB/STL extraction generated an exception: {exc}", exc_info=True)
|
| 495 |
-
glb_path, stl_path = None, None
|
| 496 |
-
|
| 497 |
-
stl_button_update = gr.update(value=stl_path, interactive=True) if stl_path else gr.update(value=None, interactive=False)
|
| 498 |
-
yield video_path, glb_path, stl_button_update
|
| 499 |
-
|
| 500 |
torch.cuda.empty_cache()
|
| 501 |
|
| 502 |
|
|
@@ -504,57 +295,50 @@ css = """
|
|
| 504 |
h1, h2, h3 { text-align: center; display: block; }
|
| 505 |
footer { visibility: hidden; }
|
| 506 |
.gradio-container { max-width: 1024px !important; }
|
| 507 |
-
/* Base styles */
|
| 508 |
.gr-image-container { display: flex !important; justify-content: center !important; align-items: center !important; width: 100%; height: 240px; }
|
| 509 |
.gr-image-container img { width: 100%; height: 100%; object-fit: contain; object-position: center; }
|
| 510 |
-
/* Desktop styles */
|
| 511 |
@media screen and (min-width: 768px) {
|
| 512 |
.gr-image-container { height: 360px !important; }
|
| 513 |
-
.video-container { height: 360px !important; max-width: 680px !important; margin: 0 auto !important; aspect-ratio: auto !important;
|
| 514 |
.model-container { height: 480px !important; max-width: 680px !important; margin: 0 auto !important; }
|
| 515 |
}
|
| 516 |
.custom-header { display: flex; align-items: center; height: 100%; }
|
| 517 |
"""
|
| 518 |
|
| 519 |
with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
|
|
|
|
|
|
|
| 520 |
with gr.Row(equal_height=True):
|
| 521 |
gr.Image("assets/sb_pocket_logo_dark.png", show_label=False, container=False, show_download_button=False, min_width=50, interactive=False, show_fullscreen_button=False)
|
| 522 |
|
| 523 |
with gr.Column():
|
| 524 |
with gr.Row():
|
| 525 |
with gr.Column(scale=2, min_width=100, variant="default"):
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
)
|
| 539 |
-
with gr.Tab(label="Multiple Images", id=1) as multiimage_input_tab:
|
| 540 |
-
multiimage_prompt = gr.Gallery(label="Images", format="png", type="pil", height=240, columns=3)
|
| 541 |
-
gr.Markdown("""
|
| 542 |
-
Input different views of the object in separate images.
|
| 543 |
-
|
| 544 |
-
*NOTE: this is experimental and may not produce the best results for all images.*
|
| 545 |
-
""")
|
| 546 |
-
multi_image_process_Button = gr.Button(value="Process Images", visible=True, interactive=True, size="lg", variant="primary")
|
| 547 |
-
|
| 548 |
|
| 549 |
with gr.Column(scale=5, min_width=100):
|
| 550 |
-
video_output = gr.Video(
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
|
|
|
|
|
|
|
|
|
| 558 |
with gr.Row(equal_height=False):
|
| 559 |
with gr.Column(scale=2, min_width=100, variant="default"):
|
| 560 |
examples = gr.Examples(
|
|
@@ -562,50 +346,56 @@ with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
|
|
| 562 |
f'./assets/example_image/{image}'
|
| 563 |
for image in os.listdir("./assets/example_image")
|
| 564 |
],
|
| 565 |
-
inputs=[image_prompt],
|
| 566 |
examples_per_page=9,
|
| 567 |
)
|
| 568 |
with gr.Column(scale=5):
|
| 569 |
-
model_output = LitModel3D(
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
|
|
|
|
|
|
| 577 |
stl_download_button = gr.DownloadButton(label="Download STL", visible=False, interactive=False, size="lg", variant="primary")
|
| 578 |
|
| 579 |
with gr.Accordion(label="Generation Settings", open=False, visible=not prod):
|
| 580 |
seed_slider = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
|
| 581 |
randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
|
|
|
|
| 582 |
gr.Markdown("Stage 1: Sparse Structure Generation")
|
| 583 |
with gr.Row():
|
| 584 |
-
ss_guidance_strength = gr.Slider(
|
| 585 |
ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
|
| 586 |
-
gr.Markdown("Stage 2:
|
| 587 |
with gr.Row():
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
|
|
|
|
|
|
|
|
|
| 591 |
|
| 592 |
with gr.Accordion(label="GLB Extraction Settings", open=False, visible=not prod):
|
| 593 |
-
mesh_simplify = gr.Slider(
|
| 594 |
-
texture_size = gr.Slider(
|
| 595 |
-
|
| 596 |
-
is_multiimage = gr.State(False)
|
| 597 |
|
| 598 |
demo.load(start_session)
|
| 599 |
demo.unload(end_session)
|
| 600 |
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
|
|
|
|
|
|
| 609 |
|
| 610 |
image_prompt.upload(
|
| 611 |
get_seed,
|
|
@@ -614,74 +404,13 @@ with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
|
|
| 614 |
show_progress="hidden",
|
| 615 |
trigger_mode="always_last"
|
| 616 |
).then(
|
| 617 |
-
fn=
|
| 618 |
-
inputs=
|
| 619 |
-
|
| 620 |
-
multiimage_prompt,
|
| 621 |
-
is_multiimage,
|
| 622 |
-
seed_slider,
|
| 623 |
-
ss_guidance_strength, ss_sampling_steps,
|
| 624 |
-
slat_guidance_strength, slat_sampling_steps,
|
| 625 |
-
mesh_simplify, texture_size,
|
| 626 |
-
multiimage_algo
|
| 627 |
-
],
|
| 628 |
-
outputs=[video_output, model_output, stl_download_button],
|
| 629 |
show_progress="hidden",
|
| 630 |
scroll_to_output=True,
|
| 631 |
)
|
| 632 |
|
| 633 |
-
multi_image_process_Button.click(
|
| 634 |
-
get_seed,
|
| 635 |
-
inputs=[randomize_seed, seed_slider],
|
| 636 |
-
outputs=[seed_slider],
|
| 637 |
-
show_progress="hidden",
|
| 638 |
-
trigger_mode="always_last"
|
| 639 |
-
).then(
|
| 640 |
-
fn=process_image_concurrently_yielding,
|
| 641 |
-
inputs=[
|
| 642 |
-
image_prompt,
|
| 643 |
-
multiimage_prompt,
|
| 644 |
-
is_multiimage,
|
| 645 |
-
seed_slider,
|
| 646 |
-
ss_guidance_strength, ss_sampling_steps,
|
| 647 |
-
slat_guidance_strength, slat_sampling_steps,
|
| 648 |
-
mesh_simplify, texture_size,
|
| 649 |
-
multiimage_algo
|
| 650 |
-
],
|
| 651 |
-
outputs=[video_output, model_output, stl_download_button],
|
| 652 |
-
show_progress="hidden",
|
| 653 |
-
scroll_to_output=True,
|
| 654 |
-
)
|
| 655 |
-
|
| 656 |
-
# multiimage_prompt.upload(
|
| 657 |
-
# preprocess_images,
|
| 658 |
-
# inputs=[multiimage_prompt],
|
| 659 |
-
# outputs=[multiimage_prompt],
|
| 660 |
-
# show_progress="minimal",
|
| 661 |
-
# ).then(
|
| 662 |
-
# get_seed,
|
| 663 |
-
# inputs=[randomize_seed, seed_slider],
|
| 664 |
-
# outputs=[seed_slider],
|
| 665 |
-
# show_progress="hidden",
|
| 666 |
-
# trigger_mode="always_last"
|
| 667 |
-
# ).then(
|
| 668 |
-
# fn=process_image_concurrently_yielding,
|
| 669 |
-
# inputs=[
|
| 670 |
-
# image_prompt,
|
| 671 |
-
# multiimage_prompt,
|
| 672 |
-
# is_multiimage,
|
| 673 |
-
# seed_slider,
|
| 674 |
-
# ss_guidance_strength, ss_sampling_steps,
|
| 675 |
-
# slat_guidance_strength, slat_sampling_steps,
|
| 676 |
-
# mesh_simplify, texture_size,
|
| 677 |
-
# multiimage_algo
|
| 678 |
-
# ],
|
| 679 |
-
# outputs=[video_output, model_output, stl_download_button],
|
| 680 |
-
# show_progress="minimal",
|
| 681 |
-
# scroll_to_output=True,
|
| 682 |
-
# )
|
| 683 |
-
|
| 684 |
-
|
| 685 |
examples.dataset.select(
|
| 686 |
fn=get_seed,
|
| 687 |
inputs=[randomize_seed, seed_slider],
|
|
@@ -689,51 +418,37 @@ with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
|
|
| 689 |
show_progress="hidden",
|
| 690 |
trigger_mode="always_last",
|
| 691 |
).then(
|
| 692 |
-
fn=
|
| 693 |
-
inputs=
|
| 694 |
-
|
| 695 |
-
multiimage_prompt,
|
| 696 |
-
is_multiimage,
|
| 697 |
-
seed_slider,
|
| 698 |
-
ss_guidance_strength, ss_sampling_steps,
|
| 699 |
-
slat_guidance_strength, slat_sampling_steps,
|
| 700 |
-
mesh_simplify, texture_size,
|
| 701 |
-
multiimage_algo
|
| 702 |
-
],
|
| 703 |
-
outputs=[video_output, model_output, stl_download_button],
|
| 704 |
show_progress="hidden",
|
| 705 |
scroll_to_output=True,
|
| 706 |
)
|
| 707 |
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
@gr.on(triggers=[image_prompt.change], inputs=None, outputs=[video_output, model_output, stl_download_button], show_progress="minimal") # MODIFIED outputs
|
| 711 |
-
def toggle_outputs_on_new_image(): # RENAMED and MODIFIED
|
| 712 |
return (
|
| 713 |
gr.update(visible=True, value=None),
|
| 714 |
-
gr.update(visible=False, value=None),
|
| 715 |
gr.update(visible=False, value=None, interactive=False)
|
| 716 |
)
|
| 717 |
-
|
| 718 |
-
@gr.on(triggers=[video_output.change], inputs=None, outputs=[model_output, stl_download_button])
|
| 719 |
-
def
|
| 720 |
return (
|
| 721 |
-
gr.update(label="Interactive 3D Model", visible=True),
|
| 722 |
gr.update(visible=True)
|
| 723 |
)
|
| 724 |
-
|
| 725 |
@gr.on(triggers=[video_output.change], inputs=None, outputs=video_output, show_progress="hidden")
|
| 726 |
def toggle_label():
|
| 727 |
return gr.update(label="Double Tap To Play", visible=True)
|
| 728 |
|
| 729 |
|
| 730 |
-
print(f"[DIAG] reached launch section, __name__={__name__}", flush=True)
|
| 731 |
if __name__ == "__main__":
|
| 732 |
-
print("[DIAG] inside __main__ block", flush=True)
|
| 733 |
-
|
| 734 |
if pipeline is None:
|
| 735 |
logger.critical("Pipeline failed to initialize. Exiting.")
|
| 736 |
-
sys.exit(1)
|
| 737 |
|
| 738 |
running_on_spaces = bool(os.getenv("SPACE_ID"))
|
| 739 |
|
|
@@ -764,4 +479,4 @@ if __name__ == "__main__":
|
|
| 764 |
debug=True,
|
| 765 |
share=True,
|
| 766 |
allowed_paths=["./cache", "./assets"]
|
| 767 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
import time
|
| 5 |
+
import io
|
| 6 |
+
import base64
|
| 7 |
+
|
| 8 |
+
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = '1'
|
| 9 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
| 10 |
+
os.environ["ATTN_BACKEND"] = "flash_attn_3"
|
| 11 |
+
os.environ["FLEX_GEMM_AUTOTUNE_CACHE_PATH"] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'autotune_cache.json')
|
| 12 |
+
os.environ["FLEX_GEMM_AUTOTUNER_VERBOSE"] = '1'
|
| 13 |
+
|
| 14 |
import gradio as gr
|
|
|
|
| 15 |
import spaces
|
| 16 |
from gradio_litmodel3d import LitModel3D
|
|
|
|
| 17 |
sys.path.append(os.getcwd())
|
| 18 |
+
import cv2
|
|
|
|
|
|
|
|
|
|
| 19 |
from typing import *
|
|
|
|
| 20 |
import torch
|
|
|
|
| 21 |
import numpy as np
|
| 22 |
import imageio
|
| 23 |
from PIL import Image
|
| 24 |
import trimesh
|
|
|
|
| 25 |
from datetime import datetime
|
| 26 |
import logging
|
| 27 |
|
| 28 |
+
from trellis2.modules.sparse import SparseTensor
|
| 29 |
+
from trellis2.pipelines import Trellis2ImageTo3DPipeline
|
| 30 |
+
from trellis2.renderers import EnvMap
|
| 31 |
+
from trellis2.utils import render_utils
|
| 32 |
+
import o_voxel
|
| 33 |
+
|
| 34 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
|
|
|
|
|
|
| 37 |
parser = argparse.ArgumentParser(description="Pocket 3D AI 2")
|
| 38 |
+
parser.add_argument("--prod", action="store_true", help="Run in production mode")
|
| 39 |
+
parser.add_argument("--port", type=int, help="Port to run the server on (default: 8081 for prod, 8080 for dev)")
|
| 40 |
+
cmd_args, _unknown_args = parser.parse_known_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
prod = cmd_args.prod
|
| 43 |
port = cmd_args.port if cmd_args.port else (8081 if prod else 8080)
|
| 44 |
show_options = not prod
|
| 45 |
|
| 46 |
MAX_SEED = np.iinfo(np.int32).max
|
|
|
|
| 47 |
TMP_DIR = os.path.join('cache')
|
| 48 |
os.makedirs(TMP_DIR, exist_ok=True)
|
| 49 |
|
|
|
|
| 50 |
if gr.NO_RELOAD:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
pipeline = None
|
| 52 |
+
envmap = None
|
| 53 |
|
| 54 |
def initialize_pipeline():
|
| 55 |
+
global pipeline, envmap
|
| 56 |
if pipeline is not None:
|
|
|
|
| 57 |
return
|
| 58 |
|
| 59 |
+
logger.info("Initializing TRELLIS.2 pipeline...")
|
| 60 |
start_time = time.time()
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
try:
|
| 63 |
+
pipeline = Trellis2ImageTo3DPipeline.from_pretrained('microsoft/TRELLIS.2-4B')
|
| 64 |
+
pipeline.rembg_model = None
|
| 65 |
+
pipeline.low_vram = False
|
| 66 |
+
pipeline._device = 'cpu'
|
| 67 |
+
|
| 68 |
+
envmap = {}
|
| 69 |
+
for name in ['forest', 'sunset', 'courtyard']:
|
| 70 |
+
exr_path = os.path.join('assets', 'hdri', f'{name}.exr')
|
| 71 |
+
if os.path.exists(exr_path):
|
| 72 |
+
envmap[name] = cv2.cvtColor(
|
| 73 |
+
cv2.imread(exr_path, cv2.IMREAD_UNCHANGED),
|
| 74 |
+
cv2.COLOR_BGR2RGB
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
logger.info(f"Pipeline initialized in {time.time() - start_time:.2f} seconds.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
except Exception as e:
|
| 79 |
logger.error(f"Failed to initialize pipeline: {e}", exc_info=True)
|
| 80 |
+
pipeline = None
|
| 81 |
+
raise
|
|
|
|
| 82 |
|
| 83 |
initialize_pipeline()
|
| 84 |
|
| 85 |
+
|
| 86 |
+
def pack_state(latents: Tuple[SparseTensor, SparseTensor, int]) -> dict:
|
| 87 |
+
shape_slat, tex_slat, res = latents
|
| 88 |
+
return {
|
| 89 |
+
'shape_slat_feats': shape_slat.feats.cpu().numpy(),
|
| 90 |
+
'tex_slat_feats': tex_slat.feats.cpu().numpy(),
|
| 91 |
+
'coords': shape_slat.coords.cpu().numpy(),
|
| 92 |
+
'res': res,
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def unpack_state(state: dict) -> Tuple[SparseTensor, SparseTensor, int]:
|
| 97 |
+
shape_slat = SparseTensor(
|
| 98 |
+
feats=torch.from_numpy(state['shape_slat_feats']).cuda(),
|
| 99 |
+
coords=torch.from_numpy(state['coords']).cuda(),
|
| 100 |
+
)
|
| 101 |
+
tex_slat = shape_slat.replace(torch.from_numpy(state['tex_slat_feats']).cuda())
|
| 102 |
+
return shape_slat, tex_slat, state['res']
|
| 103 |
+
|
| 104 |
|
| 105 |
def start_session(req: gr.Request):
|
|
|
|
| 106 |
torch.cuda.empty_cache()
|
| 107 |
+
os.makedirs(TMP_DIR, exist_ok=True)
|
| 108 |
+
|
| 109 |
|
| 110 |
def end_session(req: gr.Request):
|
| 111 |
torch.cuda.empty_cache()
|
| 112 |
|
| 113 |
+
|
| 114 |
def preprocess_image(image: Optional[Image.Image]) -> Optional[Image.Image]:
|
| 115 |
if image is None:
|
|
|
|
| 116 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
try:
|
| 118 |
+
return pipeline.preprocess_image(image)
|
|
|
|
| 119 |
except Exception as e:
|
| 120 |
logger.error(f"Error during image preprocessing: {e}", exc_info=True)
|
| 121 |
+
return None
|
| 122 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
def get_seed(randomize_seed: bool, seed: int) -> int:
|
| 125 |
+
return np.random.randint(0, MAX_SEED) if randomize_seed else seed
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
|
|
|
| 127 |
|
| 128 |
+
@spaces.GPU(duration=120)
|
| 129 |
+
def process_image_yielding(
|
| 130 |
+
image: Optional[Image.Image],
|
| 131 |
seed: int,
|
| 132 |
+
resolution: str,
|
| 133 |
ss_guidance_strength: float,
|
| 134 |
ss_sampling_steps: int,
|
| 135 |
+
shape_guidance_strength: float,
|
| 136 |
+
shape_sampling_steps: int,
|
| 137 |
+
tex_guidance_strength: float,
|
| 138 |
+
tex_sampling_steps: int,
|
| 139 |
+
mesh_simplify: int,
|
| 140 |
+
texture_size: int,
|
| 141 |
+
req: gr.Request,
|
| 142 |
progress=gr.Progress(track_tqdm=True)
|
| 143 |
+
) -> Generator:
|
| 144 |
if image is None or pipeline is None:
|
| 145 |
+
return
|
|
|
|
| 146 |
|
| 147 |
+
pipeline.cuda()
|
| 148 |
+
loaded_envmap = {}
|
| 149 |
+
for name, exr_data in envmap.items():
|
| 150 |
+
loaded_envmap[name] = EnvMap(torch.tensor(exr_data, dtype=torch.float32, device='cuda'))
|
| 151 |
|
| 152 |
+
yield None, None, gr.update(value=None, interactive=False), None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
+
progress(0, desc="Removing background...")
|
| 155 |
+
image = preprocess_image(image)
|
| 156 |
+
if image is None:
|
| 157 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
+
progress(0.1, desc="Generating 3D structure...")
|
| 160 |
pipeline_start = time.time()
|
| 161 |
|
| 162 |
try:
|
| 163 |
+
outputs, latents = pipeline.run(
|
| 164 |
+
image,
|
| 165 |
seed=seed,
|
|
|
|
| 166 |
preprocess_image=False,
|
| 167 |
sparse_structure_sampler_params={
|
| 168 |
"steps": ss_sampling_steps,
|
| 169 |
+
"guidance_strength": ss_guidance_strength,
|
| 170 |
},
|
| 171 |
+
shape_slat_sampler_params={
|
| 172 |
+
"steps": shape_sampling_steps,
|
| 173 |
+
"guidance_strength": shape_guidance_strength,
|
| 174 |
},
|
| 175 |
+
tex_slat_sampler_params={
|
| 176 |
+
"steps": tex_sampling_steps,
|
| 177 |
+
"guidance_strength": tex_guidance_strength,
|
| 178 |
+
},
|
| 179 |
+
pipeline_type={
|
| 180 |
+
"512": "512",
|
| 181 |
+
"1024": "1024_cascade",
|
| 182 |
+
"1536": "1536_cascade",
|
| 183 |
+
}[resolution],
|
| 184 |
+
return_latent=True,
|
| 185 |
)
|
|
|
|
|
|
|
| 186 |
except Exception as e:
|
| 187 |
+
logger.error(f"Error during pipeline run: {e}", exc_info=True)
|
| 188 |
torch.cuda.empty_cache()
|
| 189 |
+
return
|
| 190 |
|
| 191 |
+
logger.info(f"Pipeline Time: {time.time() - pipeline_start:.2f} seconds")
|
| 192 |
|
| 193 |
+
mesh = outputs[0]
|
| 194 |
+
mesh.simplify(16777216)
|
| 195 |
+
state = pack_state(latents)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
+
progress(0.5, desc="Rendering preview video...")
|
| 198 |
+
user_agent = req.headers.get("User-Agent", "").lower()
|
| 199 |
+
is_mobile = any(d in user_agent for d in ["android", "iphone", "ipad", "mobile"])
|
| 200 |
+
vid_resolution = 256 if is_mobile else 384
|
| 201 |
+
num_frames = 45
|
| 202 |
|
| 203 |
try:
|
| 204 |
+
vid_result = render_utils.render_video(mesh, resolution=vid_resolution, num_frames=num_frames, r=2, fov=36, envmap=loaded_envmap)
|
| 205 |
+
color_frames = vid_result.get('shaded', vid_result.get('color', []))
|
| 206 |
+
normal_frames = vid_result.get('normal', [])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
+
if color_frames and normal_frames:
|
| 209 |
+
if is_mobile:
|
| 210 |
+
combined = [np.concatenate([c, n], axis=0) for c, n in zip(color_frames, normal_frames)]
|
| 211 |
+
else:
|
| 212 |
+
combined = [np.concatenate([c, n], axis=1) for c, n in zip(color_frames, normal_frames)]
|
| 213 |
+
|
| 214 |
+
current_time = datetime.now().strftime("%Y-%m%d-%H%M%S")
|
| 215 |
+
video_path = os.path.join(TMP_DIR, f'{current_time}.mp4')
|
| 216 |
+
imageio.mimsave(video_path, combined, fps=15)
|
| 217 |
+
logger.info(f"Video rendered: {video_path}")
|
| 218 |
+
yield video_path, None, gr.update(value=None, interactive=False), state
|
| 219 |
+
elif color_frames:
|
| 220 |
+
current_time = datetime.now().strftime("%Y-%m%d-%H%M%S")
|
| 221 |
+
video_path = os.path.join(TMP_DIR, f'{current_time}.mp4')
|
| 222 |
+
imageio.mimsave(video_path, color_frames, fps=15)
|
| 223 |
+
yield video_path, None, gr.update(value=None, interactive=False), state
|
| 224 |
except Exception as e:
|
| 225 |
+
logger.error(f"Video rendering error: {e}", exc_info=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
+
progress(0.7, desc="Extracting GLB model...")
|
| 228 |
try:
|
| 229 |
+
shape_slat, tex_slat, res = unpack_state(state)
|
| 230 |
+
decoded_mesh = pipeline.decode_latent(shape_slat, tex_slat, res)[0]
|
| 231 |
+
decoded_mesh.simplify(16777216)
|
| 232 |
+
|
| 233 |
+
glb = o_voxel.postprocess.to_glb(
|
| 234 |
+
vertices=decoded_mesh.vertices,
|
| 235 |
+
faces=decoded_mesh.faces,
|
| 236 |
+
attr_volume=decoded_mesh.attrs,
|
| 237 |
+
coords=decoded_mesh.coords,
|
| 238 |
+
attr_layout=pipeline.pbr_attr_layout,
|
| 239 |
+
grid_size=res,
|
| 240 |
+
aabb=[[-0.5, -0.5, -0.5], [0.5, 0.5, 0.5]],
|
| 241 |
+
decimation_target=mesh_simplify,
|
| 242 |
texture_size=texture_size,
|
| 243 |
+
remesh=True,
|
| 244 |
+
remesh_band=1,
|
| 245 |
+
remesh_project=0,
|
| 246 |
+
use_tqdm=True,
|
| 247 |
)
|
| 248 |
+
|
| 249 |
current_time_glb = datetime.now().strftime("%Y-%m%d-%H%M%S")
|
| 250 |
+
glb_path = os.path.join(TMP_DIR, f'{current_time_glb}.glb')
|
| 251 |
+
glb.export(glb_path, extension_webp=True)
|
| 252 |
+
logger.info(f"GLB exported: {glb_path}")
|
| 253 |
|
| 254 |
+
stl_path = None
|
| 255 |
try:
|
| 256 |
mesh_data = trimesh.load_mesh(glb_path, force='mesh')
|
| 257 |
mesh_to_export = None
|
| 258 |
|
| 259 |
if isinstance(mesh_data, trimesh.Scene):
|
| 260 |
+
geometries = [g for g in mesh_data.geometry.values() if isinstance(g, trimesh.Trimesh)]
|
| 261 |
+
valid = [g for g in geometries if g.vertices is not None and len(g.vertices) > 0]
|
| 262 |
+
if valid:
|
| 263 |
+
combined_mesh = trimesh.util.concatenate(valid)
|
| 264 |
+
if isinstance(combined_mesh, trimesh.Trimesh) and len(combined_mesh.vertices) > 0:
|
| 265 |
+
mesh_to_export = combined_mesh
|
| 266 |
+
elif isinstance(mesh_data, trimesh.Trimesh) and len(mesh_data.vertices) > 0:
|
| 267 |
+
mesh_to_export = mesh_data
|
| 268 |
+
|
| 269 |
+
if mesh_to_export and mesh_to_export.faces is not None and len(mesh_to_export.faces) > 0:
|
| 270 |
+
mesh_to_export = mesh_to_export.copy()
|
| 271 |
+
rot_x_90 = trimesh.transformations.rotation_matrix(np.deg2rad(90), [1, 0, 0])
|
| 272 |
+
mesh_to_export.apply_transform(rot_x_90)
|
| 273 |
+
bbox = mesh_to_export.bounds
|
| 274 |
+
current_size = (bbox[1] - bbox[0]).max()
|
| 275 |
+
target_size_mm = 152.4
|
| 276 |
+
if current_size > 0:
|
| 277 |
+
mesh_to_export.vertices *= target_size_mm / current_size
|
| 278 |
+
current_time_stl = datetime.now().strftime("%Y-%m%d-%H%M%S-%f")
|
| 279 |
+
stl_path = os.path.join(TMP_DIR, f'{current_time_stl}.stl')
|
| 280 |
+
mesh_to_export.export(stl_path)
|
| 281 |
+
logger.info(f"STL exported: {stl_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
except Exception as stl_e:
|
| 283 |
+
logger.error(f"STL export error: {stl_e}", exc_info=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
+
stl_update = gr.update(value=stl_path, interactive=True) if stl_path else gr.update(value=None, interactive=False)
|
| 286 |
+
yield video_path, glb_path, stl_update, state
|
| 287 |
|
| 288 |
+
except Exception as e:
|
| 289 |
+
logger.error(f"GLB extraction error: {e}", exc_info=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
torch.cuda.empty_cache()
|
| 292 |
|
| 293 |
|
|
|
|
| 295 |
h1, h2, h3 { text-align: center; display: block; }
|
| 296 |
footer { visibility: hidden; }
|
| 297 |
.gradio-container { max-width: 1024px !important; }
|
|
|
|
| 298 |
.gr-image-container { display: flex !important; justify-content: center !important; align-items: center !important; width: 100%; height: 240px; }
|
| 299 |
.gr-image-container img { width: 100%; height: 100%; object-fit: contain; object-position: center; }
|
|
|
|
| 300 |
@media screen and (min-width: 768px) {
|
| 301 |
.gr-image-container { height: 360px !important; }
|
| 302 |
+
.video-container { height: 360px !important; max-width: 680px !important; margin: 0 auto !important; aspect-ratio: auto !important; }
|
| 303 |
.model-container { height: 480px !important; max-width: 680px !important; margin: 0 auto !important; }
|
| 304 |
}
|
| 305 |
.custom-header { display: flex; align-items: center; height: 100%; }
|
| 306 |
"""
|
| 307 |
|
| 308 |
with gr.Blocks(theme='Taithrah/Minimal', css=css, title="Pocket 3D AI") as demo:
|
| 309 |
+
output_state = gr.State()
|
| 310 |
+
|
| 311 |
with gr.Row(equal_height=True):
|
| 312 |
gr.Image("assets/sb_pocket_logo_dark.png", show_label=False, container=False, show_download_button=False, min_width=50, interactive=False, show_fullscreen_button=False)
|
| 313 |
|
| 314 |
with gr.Column():
|
| 315 |
with gr.Row():
|
| 316 |
with gr.Column(scale=2, min_width=100, variant="default"):
|
| 317 |
+
image_prompt = gr.Image(
|
| 318 |
+
label="Input",
|
| 319 |
+
format="png",
|
| 320 |
+
image_mode="RGBA",
|
| 321 |
+
type="pil",
|
| 322 |
+
sources=['upload', 'clipboard'],
|
| 323 |
+
container=True,
|
| 324 |
+
mirror_webcam=True,
|
| 325 |
+
visible=True,
|
| 326 |
+
height=240,
|
| 327 |
+
elem_classes="gr-image-container",
|
| 328 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
with gr.Column(scale=5, min_width=100):
|
| 331 |
+
video_output = gr.Video(
|
| 332 |
+
label=" ",
|
| 333 |
+
height=240,
|
| 334 |
+
elem_classes="video-container",
|
| 335 |
+
visible=False,
|
| 336 |
+
autoplay=True,
|
| 337 |
+
loop=True,
|
| 338 |
+
show_download_button=True,
|
| 339 |
+
interactive=False,
|
| 340 |
+
)
|
| 341 |
+
|
| 342 |
with gr.Row(equal_height=False):
|
| 343 |
with gr.Column(scale=2, min_width=100, variant="default"):
|
| 344 |
examples = gr.Examples(
|
|
|
|
| 346 |
f'./assets/example_image/{image}'
|
| 347 |
for image in os.listdir("./assets/example_image")
|
| 348 |
],
|
| 349 |
+
inputs=[image_prompt],
|
| 350 |
examples_per_page=9,
|
| 351 |
)
|
| 352 |
with gr.Column(scale=5):
|
| 353 |
+
model_output = LitModel3D(
|
| 354 |
+
label="",
|
| 355 |
+
container=True,
|
| 356 |
+
zoom_speed=0.5,
|
| 357 |
+
pan_speed=3.0,
|
| 358 |
+
exposure=10.0,
|
| 359 |
+
height=360,
|
| 360 |
+
elem_classes="model-container",
|
| 361 |
+
visible=False,
|
| 362 |
+
)
|
| 363 |
stl_download_button = gr.DownloadButton(label="Download STL", visible=False, interactive=False, size="lg", variant="primary")
|
| 364 |
|
| 365 |
with gr.Accordion(label="Generation Settings", open=False, visible=not prod):
|
| 366 |
seed_slider = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
|
| 367 |
randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
|
| 368 |
+
resolution_radio = gr.Radio(["512", "1024", "1536"], label="Resolution", value="1024")
|
| 369 |
gr.Markdown("Stage 1: Sparse Structure Generation")
|
| 370 |
with gr.Row():
|
| 371 |
+
ss_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
|
| 372 |
ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
|
| 373 |
+
gr.Markdown("Stage 2: Shape Generation")
|
| 374 |
with gr.Row():
|
| 375 |
+
shape_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
|
| 376 |
+
shape_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
|
| 377 |
+
gr.Markdown("Stage 3: Texture Generation")
|
| 378 |
+
with gr.Row():
|
| 379 |
+
tex_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=1.0, step=0.1)
|
| 380 |
+
tex_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
|
| 381 |
|
| 382 |
with gr.Accordion(label="GLB Extraction Settings", open=False, visible=not prod):
|
| 383 |
+
mesh_simplify = gr.Slider(100000, 500000, label="Decimation Target", value=300000, step=10000)
|
| 384 |
+
texture_size = gr.Slider(1024, 4096, label="Texture Size", value=2048, step=1024)
|
|
|
|
|
|
|
| 385 |
|
| 386 |
demo.load(start_session)
|
| 387 |
demo.unload(end_session)
|
| 388 |
|
| 389 |
+
generation_inputs = [
|
| 390 |
+
image_prompt,
|
| 391 |
+
seed_slider,
|
| 392 |
+
resolution_radio,
|
| 393 |
+
ss_guidance_strength, ss_sampling_steps,
|
| 394 |
+
shape_guidance_strength, shape_sampling_steps,
|
| 395 |
+
tex_guidance_strength, tex_sampling_steps,
|
| 396 |
+
mesh_simplify, texture_size,
|
| 397 |
+
]
|
| 398 |
+
generation_outputs = [video_output, model_output, stl_download_button, output_state]
|
| 399 |
|
| 400 |
image_prompt.upload(
|
| 401 |
get_seed,
|
|
|
|
| 404 |
show_progress="hidden",
|
| 405 |
trigger_mode="always_last"
|
| 406 |
).then(
|
| 407 |
+
fn=process_image_yielding,
|
| 408 |
+
inputs=generation_inputs,
|
| 409 |
+
outputs=generation_outputs,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
show_progress="hidden",
|
| 411 |
scroll_to_output=True,
|
| 412 |
)
|
| 413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
examples.dataset.select(
|
| 415 |
fn=get_seed,
|
| 416 |
inputs=[randomize_seed, seed_slider],
|
|
|
|
| 418 |
show_progress="hidden",
|
| 419 |
trigger_mode="always_last",
|
| 420 |
).then(
|
| 421 |
+
fn=process_image_yielding,
|
| 422 |
+
inputs=generation_inputs,
|
| 423 |
+
outputs=generation_outputs,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
show_progress="hidden",
|
| 425 |
scroll_to_output=True,
|
| 426 |
)
|
| 427 |
|
| 428 |
+
@gr.on(triggers=[image_prompt.change], inputs=None, outputs=[video_output, model_output, stl_download_button], show_progress="minimal")
|
| 429 |
+
def toggle_outputs_on_new_image():
|
|
|
|
|
|
|
| 430 |
return (
|
| 431 |
gr.update(visible=True, value=None),
|
| 432 |
+
gr.update(visible=False, value=None),
|
| 433 |
gr.update(visible=False, value=None, interactive=False)
|
| 434 |
)
|
| 435 |
+
|
| 436 |
+
@gr.on(triggers=[video_output.change], inputs=None, outputs=[model_output, stl_download_button])
|
| 437 |
+
def toggle_model_and_stl_visibility():
|
| 438 |
return (
|
| 439 |
+
gr.update(label="Interactive 3D Model", visible=True),
|
| 440 |
gr.update(visible=True)
|
| 441 |
)
|
| 442 |
+
|
| 443 |
@gr.on(triggers=[video_output.change], inputs=None, outputs=video_output, show_progress="hidden")
|
| 444 |
def toggle_label():
|
| 445 |
return gr.update(label="Double Tap To Play", visible=True)
|
| 446 |
|
| 447 |
|
|
|
|
| 448 |
if __name__ == "__main__":
|
|
|
|
|
|
|
| 449 |
if pipeline is None:
|
| 450 |
logger.critical("Pipeline failed to initialize. Exiting.")
|
| 451 |
+
sys.exit(1)
|
| 452 |
|
| 453 |
running_on_spaces = bool(os.getenv("SPACE_ID"))
|
| 454 |
|
|
|
|
| 479 |
debug=True,
|
| 480 |
share=True,
|
| 481 |
allowed_paths=["./cache", "./assets"]
|
| 482 |
+
)
|
wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl → assets/app/basecolor.png
RENAMED
|
File without changes
|
assets/app/clay.png
ADDED
|
Git LFS Details
|
assets/app/hdri_city.png
ADDED
|
Git LFS Details
|
assets/app/hdri_courtyard.png
ADDED
|
Git LFS Details
|
assets/app/hdri_forest.png
ADDED
|
Git LFS Details
|
assets/app/hdri_interior.png
ADDED
|
Git LFS Details
|
assets/app/hdri_night.png
ADDED
|
Git LFS Details
|
assets/app/hdri_studio.png
ADDED
|
Git LFS Details
|
assets/app/hdri_sunrise.png
ADDED
|
Git LFS Details
|
assets/app/hdri_sunset.png
ADDED
|
Git LFS Details
|
assets/app/normal.png
ADDED
|
Git LFS Details
|
assets/hdri/city.exr
ADDED
|
|
Git LFS Details
|
assets/hdri/courtyard.exr
ADDED
|
|
Git LFS Details
|
assets/hdri/forest.exr
ADDED
|
|
Git LFS Details
|
assets/hdri/interior.exr
ADDED
|
|
Git LFS Details
|
assets/hdri/license.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
All HDRIs are licensed as CC0.
|
| 2 |
+
|
| 3 |
+
These were created by Greg Zaal (Poly Haven https://polyhaven.com).
|
| 4 |
+
Originals used for each HDRI:
|
| 5 |
+
- City: https://polyhaven.com/a/portland_landing_pad
|
| 6 |
+
- Courtyard: https://polyhaven.com/a/courtyard
|
| 7 |
+
- Forest: https://polyhaven.com/a/ninomaru_teien
|
| 8 |
+
- Interior: https://polyhaven.com/a/hotel_room
|
| 9 |
+
- Night: Probably https://polyhaven.com/a/moonless_golf
|
| 10 |
+
- Studio: Probably https://polyhaven.com/a/studio_small_01
|
| 11 |
+
- Sunrise: https://polyhaven.com/a/spruit_sunrise
|
| 12 |
+
- Sunset: https://polyhaven.com/a/venice_sunset
|
| 13 |
+
|
| 14 |
+
1K resolution of each was taken, and compressed with oiiotool:
|
| 15 |
+
oiiotool input.exr --ch R,G,B -d float --compression dwab:300 --clamp:min=0.0:max=32000.0 -o output.exr
|
assets/hdri/night.exr
ADDED
|
|
Git LFS Details
|
assets/hdri/studio.exr
ADDED
|
|
Git LFS Details
|
assets/hdri/sunrise.exr
ADDED
|
|
Git LFS Details
|
assets/hdri/sunset.exr
ADDED
|
|
Git LFS Details
|
autotune_cache.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
extensions/nvdiffrast/LICENSE.txt
DELETED
|
@@ -1,97 +0,0 @@
|
|
| 1 |
-
Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
Nvidia Source Code License (1-Way Commercial)
|
| 5 |
-
|
| 6 |
-
=======================================================================
|
| 7 |
-
|
| 8 |
-
1. Definitions
|
| 9 |
-
|
| 10 |
-
"Licensor" means any person or entity that distributes its Work.
|
| 11 |
-
|
| 12 |
-
"Software" means the original work of authorship made available under
|
| 13 |
-
this License.
|
| 14 |
-
|
| 15 |
-
"Work" means the Software and any additions to or derivative works of
|
| 16 |
-
the Software that are made available under this License.
|
| 17 |
-
|
| 18 |
-
The terms "reproduce," "reproduction," "derivative works," and
|
| 19 |
-
"distribution" have the meaning as provided under U.S. copyright law;
|
| 20 |
-
provided, however, that for the purposes of this License, derivative
|
| 21 |
-
works shall not include works that remain separable from, or merely
|
| 22 |
-
link (or bind by name) to the interfaces of, the Work.
|
| 23 |
-
|
| 24 |
-
Works, including the Software, are "made available" under this License
|
| 25 |
-
by including in or with the Work either (a) a copyright notice
|
| 26 |
-
referencing the applicability of this License to the Work, or (b) a
|
| 27 |
-
copy of this License.
|
| 28 |
-
|
| 29 |
-
2. License Grants
|
| 30 |
-
|
| 31 |
-
2.1 Copyright Grant. Subject to the terms and conditions of this
|
| 32 |
-
License, each Licensor grants to you a perpetual, worldwide,
|
| 33 |
-
non-exclusive, royalty-free, copyright license to reproduce,
|
| 34 |
-
prepare derivative works of, publicly display, publicly perform,
|
| 35 |
-
sublicense and distribute its Work and any resulting derivative
|
| 36 |
-
works in any form.
|
| 37 |
-
|
| 38 |
-
3. Limitations
|
| 39 |
-
|
| 40 |
-
3.1 Redistribution. You may reproduce or distribute the Work only
|
| 41 |
-
if (a) you do so under this License, (b) you include a complete
|
| 42 |
-
copy of this License with your distribution, and (c) you retain
|
| 43 |
-
without modification any copyright, patent, trademark, or
|
| 44 |
-
attribution notices that are present in the Work.
|
| 45 |
-
|
| 46 |
-
3.2 Derivative Works. You may specify that additional or different
|
| 47 |
-
terms apply to the use, reproduction, and distribution of your
|
| 48 |
-
derivative works of the Work ("Your Terms") only if (a) Your Terms
|
| 49 |
-
provide that the use limitation in Section 3.3 applies to your
|
| 50 |
-
derivative works, and (b) you identify the specific derivative
|
| 51 |
-
works that are subject to Your Terms. Notwithstanding Your Terms,
|
| 52 |
-
this License (including the redistribution requirements in Section
|
| 53 |
-
3.1) will continue to apply to the Work itself.
|
| 54 |
-
|
| 55 |
-
3.3 Use Limitation. The Work and any derivative works thereof only
|
| 56 |
-
may be used or intended for use non-commercially. The Work or
|
| 57 |
-
derivative works thereof may be used or intended for use by Nvidia
|
| 58 |
-
or its affiliates commercially or non-commercially. As used herein,
|
| 59 |
-
"non-commercially" means for research or evaluation purposes only
|
| 60 |
-
and not for any direct or indirect monetary gain.
|
| 61 |
-
|
| 62 |
-
3.4 Patent Claims. If you bring or threaten to bring a patent claim
|
| 63 |
-
against any Licensor (including any claim, cross-claim or
|
| 64 |
-
counterclaim in a lawsuit) to enforce any patents that you allege
|
| 65 |
-
are infringed by any Work, then your rights under this License from
|
| 66 |
-
such Licensor (including the grant in Section 2.1) will terminate
|
| 67 |
-
immediately.
|
| 68 |
-
|
| 69 |
-
3.5 Trademarks. This License does not grant any rights to use any
|
| 70 |
-
Licensor's or its affiliates' names, logos, or trademarks, except
|
| 71 |
-
as necessary to reproduce the notices described in this License.
|
| 72 |
-
|
| 73 |
-
3.6 Termination. If you violate any term of this License, then your
|
| 74 |
-
rights under this License (including the grant in Section 2.1) will
|
| 75 |
-
terminate immediately.
|
| 76 |
-
|
| 77 |
-
4. Disclaimer of Warranty.
|
| 78 |
-
|
| 79 |
-
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
| 80 |
-
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
|
| 81 |
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
|
| 82 |
-
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
|
| 83 |
-
THIS LICENSE.
|
| 84 |
-
|
| 85 |
-
5. Limitation of Liability.
|
| 86 |
-
|
| 87 |
-
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
|
| 88 |
-
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
|
| 89 |
-
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
|
| 90 |
-
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
|
| 91 |
-
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
|
| 92 |
-
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
|
| 93 |
-
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
|
| 94 |
-
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
|
| 95 |
-
THE POSSIBILITY OF SUCH DAMAGES.
|
| 96 |
-
|
| 97 |
-
=======================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/README.md
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering
|
| 2 |
-
|
| 3 |
-

|
| 4 |
-
|
| 5 |
-
**Modular Primitives for High-Performance Differentiable Rendering**<br>
|
| 6 |
-
Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
|
| 7 |
-
[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
|
| 8 |
-
|
| 9 |
-
Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
|
| 10 |
-
Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) ☜☜ for more information.
|
| 11 |
-
|
| 12 |
-
## Licenses
|
| 13 |
-
|
| 14 |
-
Copyright © 2020–2024, NVIDIA Corporation. All rights reserved.
|
| 15 |
-
|
| 16 |
-
This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
|
| 17 |
-
|
| 18 |
-
For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
|
| 19 |
-
|
| 20 |
-
We do not currently accept outside code contributions in the form of pull requests.
|
| 21 |
-
|
| 22 |
-
Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
|
| 23 |
-
[sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
|
| 24 |
-
originally shared under
|
| 25 |
-
[MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
|
| 26 |
-
Mesh and texture stored as part of `samples/data/earth.npz` are derived from
|
| 27 |
-
[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
|
| 28 |
-
model originally made available under
|
| 29 |
-
[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
|
| 30 |
-
|
| 31 |
-
## Citation
|
| 32 |
-
|
| 33 |
-
```
|
| 34 |
-
@article{Laine2020diffrast,
|
| 35 |
-
title = {Modular Primitives for High-Performance Differentiable Rendering},
|
| 36 |
-
author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
|
| 37 |
-
journal = {ACM Transactions on Graphics},
|
| 38 |
-
year = {2020},
|
| 39 |
-
volume = {39},
|
| 40 |
-
number = {6}
|
| 41 |
-
}
|
| 42 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/__init__.py
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
#
|
| 3 |
-
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
# and proprietary rights in and to this software, related documentation
|
| 5 |
-
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
# distribution of this software and related documentation without an express
|
| 7 |
-
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
__version__ = '0.3.3'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/antialias.cu
DELETED
|
@@ -1,558 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#include "antialias.h"
|
| 10 |
-
|
| 11 |
-
//------------------------------------------------------------------------
|
| 12 |
-
// Helpers.
|
| 13 |
-
|
| 14 |
-
#define F32_MAX (3.402823466e+38f)
|
| 15 |
-
static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; }
|
| 16 |
-
static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); }
|
| 17 |
-
static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2)
|
| 18 |
-
{
|
| 19 |
-
bool g10 = rational_gt(n1, n0, d1, d0);
|
| 20 |
-
bool g20 = rational_gt(n2, n0, d2, d0);
|
| 21 |
-
bool g21 = rational_gt(n2, n1, d2, d1);
|
| 22 |
-
if (g20 && g21) return 2;
|
| 23 |
-
if (g10) return 1;
|
| 24 |
-
return 0;
|
| 25 |
-
}
|
| 26 |
-
|
| 27 |
-
//------------------------------------------------------------------------
|
| 28 |
-
// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4.
|
| 29 |
-
|
| 30 |
-
struct AAWorkItem
|
| 31 |
-
{
|
| 32 |
-
enum
|
| 33 |
-
{
|
| 34 |
-
EDGE_MASK = 3, // Edge index in lowest bits.
|
| 35 |
-
FLAG_DOWN_BIT = 2, // Down instead of right.
|
| 36 |
-
FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle.
|
| 37 |
-
};
|
| 38 |
-
|
| 39 |
-
int px, py; // Pixel x, y.
|
| 40 |
-
unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags.
|
| 41 |
-
float alpha; // Antialiasing alpha value. Zero if no AA.
|
| 42 |
-
};
|
| 43 |
-
|
| 44 |
-
//------------------------------------------------------------------------
|
| 45 |
-
// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html
|
| 46 |
-
|
| 47 |
-
#define JENKINS_MAGIC (0x9e3779b9u)
|
| 48 |
-
static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c)
|
| 49 |
-
{
|
| 50 |
-
a -= b; a -= c; a ^= (c>>13);
|
| 51 |
-
b -= c; b -= a; b ^= (a<<8);
|
| 52 |
-
c -= a; c -= b; c ^= (b>>13);
|
| 53 |
-
a -= b; a -= c; a ^= (c>>12);
|
| 54 |
-
b -= c; b -= a; b ^= (a<<16);
|
| 55 |
-
c -= a; c -= b; c ^= (b>>5);
|
| 56 |
-
a -= b; a -= c; a ^= (c>>3);
|
| 57 |
-
b -= c; b -= a; b ^= (a<<10);
|
| 58 |
-
c -= a; c -= b; c ^= (b>>15);
|
| 59 |
-
}
|
| 60 |
-
|
| 61 |
-
// Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip.
|
| 62 |
-
class HashIndex
|
| 63 |
-
{
|
| 64 |
-
public:
|
| 65 |
-
__device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key)
|
| 66 |
-
{
|
| 67 |
-
m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824.
|
| 68 |
-
m_idx = (uint32_t)(key & 0xffffffffu);
|
| 69 |
-
m_skip = (uint32_t)(key >> 32);
|
| 70 |
-
uint32_t dummy = JENKINS_MAGIC;
|
| 71 |
-
jenkins_mix(m_idx, m_skip, dummy);
|
| 72 |
-
m_idx &= m_mask;
|
| 73 |
-
m_skip &= m_mask;
|
| 74 |
-
m_skip |= 1;
|
| 75 |
-
}
|
| 76 |
-
__device__ __forceinline__ int get(void) const { return m_idx; }
|
| 77 |
-
__device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; }
|
| 78 |
-
private:
|
| 79 |
-
uint32_t m_idx, m_skip, m_mask;
|
| 80 |
-
};
|
| 81 |
-
|
| 82 |
-
static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v)
|
| 83 |
-
{
|
| 84 |
-
HashIndex idx(p, key);
|
| 85 |
-
while(1)
|
| 86 |
-
{
|
| 87 |
-
uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key);
|
| 88 |
-
if (prev == 0 || prev == key)
|
| 89 |
-
break;
|
| 90 |
-
idx.next();
|
| 91 |
-
}
|
| 92 |
-
int* q = (int*)&p.evHash[idx.get()];
|
| 93 |
-
int a = atomicCAS(q+2, 0, v);
|
| 94 |
-
if (a != 0 && a != v)
|
| 95 |
-
atomicCAS(q+3, 0, v);
|
| 96 |
-
}
|
| 97 |
-
|
| 98 |
-
static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key)
|
| 99 |
-
{
|
| 100 |
-
HashIndex idx(p, key);
|
| 101 |
-
while(1)
|
| 102 |
-
{
|
| 103 |
-
uint4 entry = p.evHash[idx.get()];
|
| 104 |
-
uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32);
|
| 105 |
-
if (k == key || k == 0)
|
| 106 |
-
return make_int2((int)entry.z, (int)entry.w);
|
| 107 |
-
idx.next();
|
| 108 |
-
}
|
| 109 |
-
}
|
| 110 |
-
|
| 111 |
-
static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn)
|
| 112 |
-
{
|
| 113 |
-
if (va == vb)
|
| 114 |
-
return;
|
| 115 |
-
|
| 116 |
-
uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
|
| 117 |
-
uint64_t v1 = (uint32_t)max(va, vb) + 1;
|
| 118 |
-
uint64_t vk = v0 | (v1 << 32); // hash key
|
| 119 |
-
hash_insert(p, vk, vn + 1);
|
| 120 |
-
}
|
| 121 |
-
|
| 122 |
-
static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr)
|
| 123 |
-
{
|
| 124 |
-
if (va == vb)
|
| 125 |
-
return -1;
|
| 126 |
-
|
| 127 |
-
uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
|
| 128 |
-
uint64_t v1 = (uint32_t)max(va, vb) + 1;
|
| 129 |
-
uint64_t vk = v0 | (v1 << 32); // hash key
|
| 130 |
-
int2 vn = hash_find(p, vk) - 1;
|
| 131 |
-
if (vn.x == vr) return vn.y;
|
| 132 |
-
if (vn.y == vr) return vn.x;
|
| 133 |
-
return -1;
|
| 134 |
-
}
|
| 135 |
-
|
| 136 |
-
//------------------------------------------------------------------------
|
| 137 |
-
// Mesh analysis kernel.
|
| 138 |
-
|
| 139 |
-
__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p)
|
| 140 |
-
{
|
| 141 |
-
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
| 142 |
-
if (idx >= p.numTriangles)
|
| 143 |
-
return;
|
| 144 |
-
|
| 145 |
-
int v0 = p.tri[idx * 3 + 0];
|
| 146 |
-
int v1 = p.tri[idx * 3 + 1];
|
| 147 |
-
int v2 = p.tri[idx * 3 + 2];
|
| 148 |
-
|
| 149 |
-
if (v0 < 0 || v0 >= p.numVertices ||
|
| 150 |
-
v1 < 0 || v1 >= p.numVertices ||
|
| 151 |
-
v2 < 0 || v2 >= p.numVertices)
|
| 152 |
-
return;
|
| 153 |
-
|
| 154 |
-
if (v0 == v1 || v1 == v2 || v2 == v0)
|
| 155 |
-
return;
|
| 156 |
-
|
| 157 |
-
evhash_insert_vertex(p, v1, v2, v0);
|
| 158 |
-
evhash_insert_vertex(p, v2, v0, v1);
|
| 159 |
-
evhash_insert_vertex(p, v0, v1, v2);
|
| 160 |
-
}
|
| 161 |
-
|
| 162 |
-
//------------------------------------------------------------------------
|
| 163 |
-
// Discontinuity finder kernel.
|
| 164 |
-
|
| 165 |
-
__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p)
|
| 166 |
-
{
|
| 167 |
-
// Calculate pixel position.
|
| 168 |
-
int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
|
| 169 |
-
int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
|
| 170 |
-
int pz = blockIdx.z;
|
| 171 |
-
if (px >= p.width || py >= p.height || pz >= p.n)
|
| 172 |
-
return;
|
| 173 |
-
|
| 174 |
-
// Pointer to our TriIdx and fetch.
|
| 175 |
-
int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3;
|
| 176 |
-
float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other.
|
| 177 |
-
|
| 178 |
-
// Look right, clamp at edge.
|
| 179 |
-
int pidx1 = pidx0;
|
| 180 |
-
if (px < p.width - 1)
|
| 181 |
-
pidx1 += 4;
|
| 182 |
-
float tri1 = p.rasterOut[pidx1];
|
| 183 |
-
|
| 184 |
-
// Look down, clamp at edge.
|
| 185 |
-
int pidx2 = pidx0;
|
| 186 |
-
if (py < p.height - 1)
|
| 187 |
-
pidx2 += p.width << 2;
|
| 188 |
-
float tri2 = p.rasterOut[pidx2];
|
| 189 |
-
|
| 190 |
-
// Determine amount of work.
|
| 191 |
-
int count = 0;
|
| 192 |
-
if (tri1 != tri0) count = 1;
|
| 193 |
-
if (tri2 != tri0) count += 1;
|
| 194 |
-
if (!count)
|
| 195 |
-
return; // Exit warp.
|
| 196 |
-
|
| 197 |
-
// Coalesce work counter update to once per CTA.
|
| 198 |
-
__shared__ int s_temp;
|
| 199 |
-
s_temp = 0;
|
| 200 |
-
__syncthreads();
|
| 201 |
-
int idx = atomicAdd(&s_temp, count);
|
| 202 |
-
__syncthreads();
|
| 203 |
-
if (idx == 0)
|
| 204 |
-
{
|
| 205 |
-
int base = atomicAdd(&p.workBuffer[0].x, s_temp);
|
| 206 |
-
s_temp = base + 1; // don't clobber the counters in first slot.
|
| 207 |
-
}
|
| 208 |
-
__syncthreads();
|
| 209 |
-
idx += s_temp;
|
| 210 |
-
|
| 211 |
-
// Write to memory.
|
| 212 |
-
if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0);
|
| 213 |
-
if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0);
|
| 214 |
-
}
|
| 215 |
-
|
| 216 |
-
//------------------------------------------------------------------------
|
| 217 |
-
// Forward analysis kernel.
|
| 218 |
-
|
| 219 |
-
__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p)
|
| 220 |
-
{
|
| 221 |
-
__shared__ int s_base;
|
| 222 |
-
int workCount = p.workBuffer[0].x;
|
| 223 |
-
for(;;)
|
| 224 |
-
{
|
| 225 |
-
// Persistent threads work fetcher.
|
| 226 |
-
__syncthreads();
|
| 227 |
-
if (threadIdx.x == 0)
|
| 228 |
-
s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
|
| 229 |
-
__syncthreads();
|
| 230 |
-
int thread_idx = s_base + threadIdx.x;
|
| 231 |
-
if (thread_idx >= workCount)
|
| 232 |
-
return;
|
| 233 |
-
|
| 234 |
-
int4* pItem = p.workBuffer + thread_idx + 1;
|
| 235 |
-
int4 item = *pItem;
|
| 236 |
-
int px = item.x;
|
| 237 |
-
int py = item.y;
|
| 238 |
-
int pz = (int)(((unsigned int)item.z) >> 16);
|
| 239 |
-
int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
|
| 240 |
-
|
| 241 |
-
int pixel0 = px + p.width * (py + p.height * pz);
|
| 242 |
-
int pixel1 = pixel0 + (d ? p.width : 1);
|
| 243 |
-
float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1];
|
| 244 |
-
float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
|
| 245 |
-
int tri0 = float_to_triidx(zt0.y) - 1;
|
| 246 |
-
int tri1 = float_to_triidx(zt1.y) - 1;
|
| 247 |
-
|
| 248 |
-
// Select triangle based on background / depth.
|
| 249 |
-
int tri = (tri0 >= 0) ? tri0 : tri1;
|
| 250 |
-
if (tri0 >= 0 && tri1 >= 0)
|
| 251 |
-
tri = (zt0.x < zt1.x) ? tri0 : tri1;
|
| 252 |
-
if (tri == tri1)
|
| 253 |
-
{
|
| 254 |
-
// Calculate with respect to neighbor pixel if chose that triangle.
|
| 255 |
-
px += 1 - d;
|
| 256 |
-
py += d;
|
| 257 |
-
}
|
| 258 |
-
|
| 259 |
-
// Bail out if triangle index is corrupt.
|
| 260 |
-
if (tri < 0 || tri >= p.numTriangles)
|
| 261 |
-
continue;
|
| 262 |
-
|
| 263 |
-
// Fetch vertex indices.
|
| 264 |
-
int vi0 = p.tri[tri * 3 + 0];
|
| 265 |
-
int vi1 = p.tri[tri * 3 + 1];
|
| 266 |
-
int vi2 = p.tri[tri * 3 + 2];
|
| 267 |
-
|
| 268 |
-
// Bail out if vertex indices are corrupt.
|
| 269 |
-
if (vi0 < 0 || vi0 >= p.numVertices ||
|
| 270 |
-
vi1 < 0 || vi1 >= p.numVertices ||
|
| 271 |
-
vi2 < 0 || vi2 >= p.numVertices)
|
| 272 |
-
continue;
|
| 273 |
-
|
| 274 |
-
// Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists.
|
| 275 |
-
int op0 = evhash_find_vertex(p, vi2, vi1, vi0);
|
| 276 |
-
int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
|
| 277 |
-
int op2 = evhash_find_vertex(p, vi1, vi0, vi2);
|
| 278 |
-
|
| 279 |
-
// Instance mode: Adjust vertex indices based on minibatch index.
|
| 280 |
-
if (p.instance_mode)
|
| 281 |
-
{
|
| 282 |
-
int vbase = pz * p.numVertices;
|
| 283 |
-
vi0 += vbase;
|
| 284 |
-
vi1 += vbase;
|
| 285 |
-
vi2 += vbase;
|
| 286 |
-
if (op0 >= 0) op0 += vbase;
|
| 287 |
-
if (op1 >= 0) op1 += vbase;
|
| 288 |
-
if (op2 >= 0) op2 += vbase;
|
| 289 |
-
}
|
| 290 |
-
|
| 291 |
-
// Fetch vertex positions.
|
| 292 |
-
float4 p0 = ((float4*)p.pos)[vi0];
|
| 293 |
-
float4 p1 = ((float4*)p.pos)[vi1];
|
| 294 |
-
float4 p2 = ((float4*)p.pos)[vi2];
|
| 295 |
-
float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0];
|
| 296 |
-
float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
|
| 297 |
-
float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];
|
| 298 |
-
|
| 299 |
-
// Project vertices to pixel space.
|
| 300 |
-
float w0 = 1.f / p0.w;
|
| 301 |
-
float w1 = 1.f / p1.w;
|
| 302 |
-
float w2 = 1.f / p2.w;
|
| 303 |
-
float ow0 = 1.f / o0.w;
|
| 304 |
-
float ow1 = 1.f / o1.w;
|
| 305 |
-
float ow2 = 1.f / o2.w;
|
| 306 |
-
float fx = (float)px + .5f - p.xh;
|
| 307 |
-
float fy = (float)py + .5f - p.yh;
|
| 308 |
-
float x0 = p0.x * w0 * p.xh - fx;
|
| 309 |
-
float y0 = p0.y * w0 * p.yh - fy;
|
| 310 |
-
float x1 = p1.x * w1 * p.xh - fx;
|
| 311 |
-
float y1 = p1.y * w1 * p.yh - fy;
|
| 312 |
-
float x2 = p2.x * w2 * p.xh - fx;
|
| 313 |
-
float y2 = p2.y * w2 * p.yh - fy;
|
| 314 |
-
float ox0 = o0.x * ow0 * p.xh - fx;
|
| 315 |
-
float oy0 = o0.y * ow0 * p.yh - fy;
|
| 316 |
-
float ox1 = o1.x * ow1 * p.xh - fx;
|
| 317 |
-
float oy1 = o1.y * ow1 * p.yh - fy;
|
| 318 |
-
float ox2 = o2.x * ow2 * p.xh - fx;
|
| 319 |
-
float oy2 = o2.y * ow2 * p.yh - fy;
|
| 320 |
-
|
| 321 |
-
// Signs to kill non-silhouette edges.
|
| 322 |
-
float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself.
|
| 323 |
-
float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings.
|
| 324 |
-
float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
|
| 325 |
-
float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);
|
| 326 |
-
|
| 327 |
-
// If no matching signs anywhere, skip the rest.
|
| 328 |
-
if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
|
| 329 |
-
{
|
| 330 |
-
// XY flip for horizontal edges.
|
| 331 |
-
if (d)
|
| 332 |
-
{
|
| 333 |
-
swap(x0, y0);
|
| 334 |
-
swap(x1, y1);
|
| 335 |
-
swap(x2, y2);
|
| 336 |
-
}
|
| 337 |
-
|
| 338 |
-
float dx0 = x2 - x1;
|
| 339 |
-
float dx1 = x0 - x2;
|
| 340 |
-
float dx2 = x1 - x0;
|
| 341 |
-
float dy0 = y2 - y1;
|
| 342 |
-
float dy1 = y0 - y2;
|
| 343 |
-
float dy2 = y1 - y0;
|
| 344 |
-
|
| 345 |
-
// Check if an edge crosses between us and the neighbor pixel.
|
| 346 |
-
float dc = -F32_MAX;
|
| 347 |
-
float ds = (tri == tri0) ? 1.f : -1.f;
|
| 348 |
-
float d0 = ds * (x1*dy0 - y1*dx0);
|
| 349 |
-
float d1 = ds * (x2*dy1 - y2*dx1);
|
| 350 |
-
float d2 = ds * (x0*dy2 - y0*dx2);
|
| 351 |
-
|
| 352 |
-
if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f;
|
| 353 |
-
if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f;
|
| 354 |
-
if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f;
|
| 355 |
-
|
| 356 |
-
int di = max_idx3(d0, d1, d2, dy0, dy1, dy2);
|
| 357 |
-
if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
|
| 358 |
-
if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
|
| 359 |
-
if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
|
| 360 |
-
float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.
|
| 361 |
-
|
| 362 |
-
// Adjust output image if a suitable edge was found.
|
| 363 |
-
if (dc > -eps && dc < 1.f + eps)
|
| 364 |
-
{
|
| 365 |
-
dc = fminf(fmaxf(dc, 0.f), 1.f);
|
| 366 |
-
float alpha = ds * (.5f - dc);
|
| 367 |
-
const float* pColor0 = p.color + pixel0 * p.channels;
|
| 368 |
-
const float* pColor1 = p.color + pixel1 * p.channels;
|
| 369 |
-
float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
|
| 370 |
-
for (int i=0; i < p.channels; i++)
|
| 371 |
-
atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));
|
| 372 |
-
|
| 373 |
-
// Rewrite the work item's flags and alpha. Keep original px, py.
|
| 374 |
-
unsigned int flags = pz << 16;
|
| 375 |
-
flags |= di;
|
| 376 |
-
flags |= d << AAWorkItem::FLAG_DOWN_BIT;
|
| 377 |
-
flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT;
|
| 378 |
-
((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha));
|
| 379 |
-
}
|
| 380 |
-
}
|
| 381 |
-
}
|
| 382 |
-
}
|
| 383 |
-
|
| 384 |
-
//------------------------------------------------------------------------
|
| 385 |
-
// Gradient kernel.
|
| 386 |
-
|
| 387 |
-
__global__ void AntialiasGradKernel(const AntialiasKernelParams p)
|
| 388 |
-
{
|
| 389 |
-
// Temporary space for coalesced atomics.
|
| 390 |
-
CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
|
| 391 |
-
__shared__ int s_base; // Work counter communication across entire CTA.
|
| 392 |
-
|
| 393 |
-
int workCount = p.workBuffer[0].x;
|
| 394 |
-
|
| 395 |
-
for(;;)
|
| 396 |
-
{
|
| 397 |
-
// Persistent threads work fetcher.
|
| 398 |
-
__syncthreads();
|
| 399 |
-
if (threadIdx.x == 0)
|
| 400 |
-
s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
|
| 401 |
-
__syncthreads();
|
| 402 |
-
int thread_idx = s_base + threadIdx.x;
|
| 403 |
-
if (thread_idx >= workCount)
|
| 404 |
-
return;
|
| 405 |
-
|
| 406 |
-
// Read work item filled out by forward kernel.
|
| 407 |
-
int4 item = p.workBuffer[thread_idx + 1];
|
| 408 |
-
unsigned int amask = __ballot_sync(0xffffffffu, item.w);
|
| 409 |
-
if (item.w == 0)
|
| 410 |
-
continue; // No effect.
|
| 411 |
-
|
| 412 |
-
// Unpack work item and replicate setup from forward analysis kernel.
|
| 413 |
-
int px = item.x;
|
| 414 |
-
int py = item.y;
|
| 415 |
-
int pz = (int)(((unsigned int)item.z) >> 16);
|
| 416 |
-
int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
|
| 417 |
-
float alpha = __int_as_float(item.w);
|
| 418 |
-
int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1;
|
| 419 |
-
int di = item.z & AAWorkItem::EDGE_MASK;
|
| 420 |
-
float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31));
|
| 421 |
-
int pixel0 = px + p.width * (py + p.height * pz);
|
| 422 |
-
int pixel1 = pixel0 + (d ? p.width : 1);
|
| 423 |
-
int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1;
|
| 424 |
-
if (tri1)
|
| 425 |
-
{
|
| 426 |
-
px += 1 - d;
|
| 427 |
-
py += d;
|
| 428 |
-
}
|
| 429 |
-
|
| 430 |
-
// Bail out if triangle index is corrupt.
|
| 431 |
-
bool triFail = (tri < 0 || tri >= p.numTriangles);
|
| 432 |
-
amask = __ballot_sync(amask, !triFail);
|
| 433 |
-
if (triFail)
|
| 434 |
-
continue;
|
| 435 |
-
|
| 436 |
-
// Outgoing color gradients.
|
| 437 |
-
float* pGrad0 = p.gradColor + pixel0 * p.channels;
|
| 438 |
-
float* pGrad1 = p.gradColor + pixel1 * p.channels;
|
| 439 |
-
|
| 440 |
-
// Incoming color gradients.
|
| 441 |
-
const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
|
| 442 |
-
|
| 443 |
-
// Position gradient weight based on colors and incoming gradients.
|
| 444 |
-
float dd = 0.f;
|
| 445 |
-
const float* pColor0 = p.color + pixel0 * p.channels;
|
| 446 |
-
const float* pColor1 = p.color + pixel1 * p.channels;
|
| 447 |
-
|
| 448 |
-
// Loop over channels and accumulate.
|
| 449 |
-
for (int i=0; i < p.channels; i++)
|
| 450 |
-
{
|
| 451 |
-
float dy = pDy[i];
|
| 452 |
-
if (dy != 0.f)
|
| 453 |
-
{
|
| 454 |
-
// Update position gradient weight.
|
| 455 |
-
dd += dy * (pColor1[i] - pColor0[i]);
|
| 456 |
-
|
| 457 |
-
// Update color gradients. No coalescing because all have different targets.
|
| 458 |
-
float v = alpha * dy;
|
| 459 |
-
atomicAdd(&pGrad0[i], -v);
|
| 460 |
-
atomicAdd(&pGrad1[i], v);
|
| 461 |
-
}
|
| 462 |
-
}
|
| 463 |
-
|
| 464 |
-
// If position weight is zero, skip the rest.
|
| 465 |
-
bool noGrad = (dd == 0.f);
|
| 466 |
-
amask = __ballot_sync(amask, !noGrad);
|
| 467 |
-
if (noGrad)
|
| 468 |
-
continue;
|
| 469 |
-
|
| 470 |
-
// Fetch vertex indices of the active edge and their positions.
|
| 471 |
-
int i1 = (di < 2) ? (di + 1) : 0;
|
| 472 |
-
int i2 = (i1 < 2) ? (i1 + 1) : 0;
|
| 473 |
-
int vi1 = p.tri[3 * tri + i1];
|
| 474 |
-
int vi2 = p.tri[3 * tri + i2];
|
| 475 |
-
|
| 476 |
-
// Bail out if vertex indices are corrupt.
|
| 477 |
-
bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
|
| 478 |
-
amask = __ballot_sync(amask, !vtxFail);
|
| 479 |
-
if (vtxFail)
|
| 480 |
-
continue;
|
| 481 |
-
|
| 482 |
-
// Instance mode: Adjust vertex indices based on minibatch index.
|
| 483 |
-
if (p.instance_mode)
|
| 484 |
-
{
|
| 485 |
-
vi1 += pz * p.numVertices;
|
| 486 |
-
vi2 += pz * p.numVertices;
|
| 487 |
-
}
|
| 488 |
-
|
| 489 |
-
// Fetch vertex positions.
|
| 490 |
-
float4 p1 = ((float4*)p.pos)[vi1];
|
| 491 |
-
float4 p2 = ((float4*)p.pos)[vi2];
|
| 492 |
-
|
| 493 |
-
// Project vertices to pixel space.
|
| 494 |
-
float pxh = p.xh;
|
| 495 |
-
float pyh = p.yh;
|
| 496 |
-
float fx = (float)px + .5f - pxh;
|
| 497 |
-
float fy = (float)py + .5f - pyh;
|
| 498 |
-
|
| 499 |
-
// XY flip for horizontal edges.
|
| 500 |
-
if (d)
|
| 501 |
-
{
|
| 502 |
-
swap(p1.x, p1.y);
|
| 503 |
-
swap(p2.x, p2.y);
|
| 504 |
-
swap(pxh, pyh);
|
| 505 |
-
swap(fx, fy);
|
| 506 |
-
}
|
| 507 |
-
|
| 508 |
-
// Gradient calculation setup.
|
| 509 |
-
float w1 = 1.f / p1.w;
|
| 510 |
-
float w2 = 1.f / p2.w;
|
| 511 |
-
float x1 = p1.x * w1 * pxh - fx;
|
| 512 |
-
float y1 = p1.y * w1 * pyh - fy;
|
| 513 |
-
float x2 = p2.x * w2 * pxh - fx;
|
| 514 |
-
float y2 = p2.y * w2 * pyh - fy;
|
| 515 |
-
float dx = x2 - x1;
|
| 516 |
-
float dy = y2 - y1;
|
| 517 |
-
float db = x1*dy - y1*dx;
|
| 518 |
-
|
| 519 |
-
// Compute inverse delta-y with epsilon.
|
| 520 |
-
float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
|
| 521 |
-
float iy = 1.f / (dy + ep);
|
| 522 |
-
|
| 523 |
-
// Compute position gradients.
|
| 524 |
-
float dby = db * iy;
|
| 525 |
-
float iw1 = -w1 * iy * dd;
|
| 526 |
-
float iw2 = w2 * iy * dd;
|
| 527 |
-
float gp1x = iw1 * pxh * y2;
|
| 528 |
-
float gp2x = iw2 * pxh * y1;
|
| 529 |
-
float gp1y = iw1 * pyh * (dby - x2);
|
| 530 |
-
float gp2y = iw2 * pyh * (dby - x1);
|
| 531 |
-
float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1;
|
| 532 |
-
float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;
|
| 533 |
-
|
| 534 |
-
// XY flip the gradients.
|
| 535 |
-
if (d)
|
| 536 |
-
{
|
| 537 |
-
swap(gp1x, gp1y);
|
| 538 |
-
swap(gp2x, gp2y);
|
| 539 |
-
}
|
| 540 |
-
|
| 541 |
-
// Kill position gradients if alpha was saturated.
|
| 542 |
-
if (fabsf(alpha) >= 0.5f)
|
| 543 |
-
{
|
| 544 |
-
gp1x = gp1y = gp1w = 0.f;
|
| 545 |
-
gp2x = gp2y = gp2w = 0.f;
|
| 546 |
-
}
|
| 547 |
-
|
| 548 |
-
// Initialize coalesced atomics. Match both triangle ID and edge index.
|
| 549 |
-
// Also note that some threads may be inactive.
|
| 550 |
-
CA_SET_GROUP_MASK(tri ^ (di << 30), amask);
|
| 551 |
-
|
| 552 |
-
// Accumulate gradients.
|
| 553 |
-
caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
|
| 554 |
-
caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
|
| 555 |
-
}
|
| 556 |
-
}
|
| 557 |
-
|
| 558 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/antialias.h
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
#include "common.h"
|
| 11 |
-
|
| 12 |
-
//------------------------------------------------------------------------
|
| 13 |
-
// Constants and helpers.
|
| 14 |
-
|
| 15 |
-
#define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32
|
| 16 |
-
#define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8
|
| 17 |
-
#define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256
|
| 18 |
-
#define AA_MESH_KERNEL_THREADS_PER_BLOCK 256
|
| 19 |
-
#define AA_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 4 : 8) // With more than 16777216 triangles (alloc >= 33554432) use smallest possible value of 4 to conserve memory, otherwise use 8 for fewer collisions.
|
| 20 |
-
#define AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 2 : 3)
|
| 21 |
-
#define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256
|
| 22 |
-
|
| 23 |
-
//------------------------------------------------------------------------
|
| 24 |
-
// CUDA kernel params.
|
| 25 |
-
|
| 26 |
-
struct AntialiasKernelParams
|
| 27 |
-
{
|
| 28 |
-
const float* color; // Incoming color buffer.
|
| 29 |
-
const float* rasterOut; // Incoming rasterizer output buffer.
|
| 30 |
-
const int* tri; // Incoming triangle buffer.
|
| 31 |
-
const float* pos; // Incoming position buffer.
|
| 32 |
-
float* output; // Output buffer of forward kernel.
|
| 33 |
-
const float* dy; // Incoming gradients.
|
| 34 |
-
float* gradColor; // Output buffer, color gradient.
|
| 35 |
-
float* gradPos; // Output buffer, position gradient.
|
| 36 |
-
int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters.
|
| 37 |
-
uint4* evHash; // Edge-vertex hash.
|
| 38 |
-
int allocTriangles; // Number of triangles accommodated by evHash. Always power of two.
|
| 39 |
-
int numTriangles; // Number of triangles.
|
| 40 |
-
int numVertices; // Number of vertices.
|
| 41 |
-
int width; // Input width.
|
| 42 |
-
int height; // Input height.
|
| 43 |
-
int n; // Minibatch size.
|
| 44 |
-
int channels; // Channel count in color input.
|
| 45 |
-
float xh, yh; // Transfer to pixel space.
|
| 46 |
-
int instance_mode; // 0=normal, 1=instance mode.
|
| 47 |
-
int tri_const; // 1 if triangle array is known to be constant.
|
| 48 |
-
};
|
| 49 |
-
|
| 50 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/common.cpp
DELETED
|
@@ -1,60 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#include <cuda_runtime.h>
|
| 10 |
-
|
| 11 |
-
//------------------------------------------------------------------------
|
| 12 |
-
// Block and grid size calculators for kernel launches.
|
| 13 |
-
|
| 14 |
-
dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height)
|
| 15 |
-
{
|
| 16 |
-
int maxThreads = maxWidth * maxHeight;
|
| 17 |
-
if (maxThreads <= 1 || (width * height) <= 1)
|
| 18 |
-
return dim3(1, 1, 1); // Degenerate.
|
| 19 |
-
|
| 20 |
-
// Start from max size.
|
| 21 |
-
int bw = maxWidth;
|
| 22 |
-
int bh = maxHeight;
|
| 23 |
-
|
| 24 |
-
// Optimizations for weirdly sized buffers.
|
| 25 |
-
if (width < bw)
|
| 26 |
-
{
|
| 27 |
-
// Decrease block width to smallest power of two that covers the buffer width.
|
| 28 |
-
while ((bw >> 1) >= width)
|
| 29 |
-
bw >>= 1;
|
| 30 |
-
|
| 31 |
-
// Maximize height.
|
| 32 |
-
bh = maxThreads / bw;
|
| 33 |
-
if (bh > height)
|
| 34 |
-
bh = height;
|
| 35 |
-
}
|
| 36 |
-
else if (height < bh)
|
| 37 |
-
{
|
| 38 |
-
// Halve height and double width until fits completely inside buffer vertically.
|
| 39 |
-
while (bh > height)
|
| 40 |
-
{
|
| 41 |
-
bh >>= 1;
|
| 42 |
-
if (bw < width)
|
| 43 |
-
bw <<= 1;
|
| 44 |
-
}
|
| 45 |
-
}
|
| 46 |
-
|
| 47 |
-
// Done.
|
| 48 |
-
return dim3(bw, bh, 1);
|
| 49 |
-
}
|
| 50 |
-
|
| 51 |
-
dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth)
|
| 52 |
-
{
|
| 53 |
-
dim3 gridSize;
|
| 54 |
-
gridSize.x = (width - 1) / blockSize.x + 1;
|
| 55 |
-
gridSize.y = (height - 1) / blockSize.y + 1;
|
| 56 |
-
gridSize.z = (depth - 1) / blockSize.z + 1;
|
| 57 |
-
return gridSize;
|
| 58 |
-
}
|
| 59 |
-
|
| 60 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/common.h
DELETED
|
@@ -1,263 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
#include <cuda.h>
|
| 11 |
-
#include <stdint.h>
|
| 12 |
-
|
| 13 |
-
//------------------------------------------------------------------------
|
| 14 |
-
// C++ helper function prototypes.
|
| 15 |
-
|
| 16 |
-
dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height);
|
| 17 |
-
dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth);
|
| 18 |
-
|
| 19 |
-
//------------------------------------------------------------------------
|
| 20 |
-
// The rest is CUDA device code specific stuff.
|
| 21 |
-
|
| 22 |
-
#ifdef __CUDACC__
|
| 23 |
-
|
| 24 |
-
//------------------------------------------------------------------------
|
| 25 |
-
// Helpers for CUDA vector types.
|
| 26 |
-
|
| 27 |
-
static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; }
|
| 28 |
-
static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; }
|
| 29 |
-
static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; }
|
| 30 |
-
static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; }
|
| 31 |
-
static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; }
|
| 32 |
-
static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; }
|
| 33 |
-
static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); }
|
| 34 |
-
static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); }
|
| 35 |
-
static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); }
|
| 36 |
-
static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); }
|
| 37 |
-
static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); }
|
| 38 |
-
static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); }
|
| 39 |
-
static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); }
|
| 40 |
-
static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); }
|
| 41 |
-
static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); }
|
| 42 |
-
static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); }
|
| 43 |
-
static __device__ __forceinline__ float3& operator*= (float3& a, const float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
|
| 44 |
-
static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
|
| 45 |
-
static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
|
| 46 |
-
static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; }
|
| 47 |
-
static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; }
|
| 48 |
-
static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; }
|
| 49 |
-
static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
|
| 50 |
-
static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
|
| 51 |
-
static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
|
| 52 |
-
static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); }
|
| 53 |
-
static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); }
|
| 54 |
-
static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); }
|
| 55 |
-
static __device__ __forceinline__ float3 operator* (float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); }
|
| 56 |
-
static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); }
|
| 57 |
-
static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); }
|
| 58 |
-
static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, -a.z); }
|
| 59 |
-
static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
|
| 60 |
-
static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
|
| 61 |
-
static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
|
| 62 |
-
static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
|
| 63 |
-
static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
|
| 64 |
-
static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
|
| 65 |
-
static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
|
| 66 |
-
static __device__ __forceinline__ float4 operator+ (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
|
| 67 |
-
static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
|
| 68 |
-
static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); }
|
| 69 |
-
static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); }
|
| 70 |
-
static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); }
|
| 71 |
-
static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); }
|
| 72 |
-
static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); }
|
| 73 |
-
static __device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); }
|
| 74 |
-
static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); }
|
| 75 |
-
static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; }
|
| 76 |
-
static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; }
|
| 77 |
-
static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; }
|
| 78 |
-
static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; }
|
| 79 |
-
static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; }
|
| 80 |
-
static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; }
|
| 81 |
-
static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); }
|
| 82 |
-
static __device__ __forceinline__ int2 operator+ (const int2& a, const int2& b) { return make_int2(a.x + b.x, a.y + b.y); }
|
| 83 |
-
static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); }
|
| 84 |
-
static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); }
|
| 85 |
-
static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); }
|
| 86 |
-
static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); }
|
| 87 |
-
static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); }
|
| 88 |
-
static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); }
|
| 89 |
-
static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); }
|
| 90 |
-
static __device__ __forceinline__ int2 operator- (const int2& a) { return make_int2(-a.x, -a.y); }
|
| 91 |
-
static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
|
| 92 |
-
static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
|
| 93 |
-
static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
|
| 94 |
-
static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
|
| 95 |
-
static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; }
|
| 96 |
-
static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
|
| 97 |
-
static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
|
| 98 |
-
static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
|
| 99 |
-
static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
|
| 100 |
-
static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); }
|
| 101 |
-
static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); }
|
| 102 |
-
static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); }
|
| 103 |
-
static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); }
|
| 104 |
-
static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); }
|
| 105 |
-
static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); }
|
| 106 |
-
static __device__ __forceinline__ int3 operator- (const int3& a) { return make_int3(-a.x, -a.y, -a.z); }
|
| 107 |
-
static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
|
| 108 |
-
static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
|
| 109 |
-
static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
|
| 110 |
-
static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
|
| 111 |
-
static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
|
| 112 |
-
static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
|
| 113 |
-
static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
|
| 114 |
-
static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
|
| 115 |
-
static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
|
| 116 |
-
static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); }
|
| 117 |
-
static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); }
|
| 118 |
-
static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); }
|
| 119 |
-
static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); }
|
| 120 |
-
static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); }
|
| 121 |
-
static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return make_int4(a - b.x, a - b.y, a - b.z, a - b.w); }
|
| 122 |
-
static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); }
|
| 123 |
-
static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; }
|
| 124 |
-
static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; }
|
| 125 |
-
static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; }
|
| 126 |
-
static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; }
|
| 127 |
-
static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; }
|
| 128 |
-
static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; }
|
| 129 |
-
static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); }
|
| 130 |
-
static __device__ __forceinline__ uint2 operator+ (const uint2& a, const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); }
|
| 131 |
-
static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); }
|
| 132 |
-
static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); }
|
| 133 |
-
static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); }
|
| 134 |
-
static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); }
|
| 135 |
-
static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); }
|
| 136 |
-
static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); }
|
| 137 |
-
static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); }
|
| 138 |
-
static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
|
| 139 |
-
static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
|
| 140 |
-
static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
|
| 141 |
-
static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
|
| 142 |
-
static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; }
|
| 143 |
-
static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
|
| 144 |
-
static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); }
|
| 145 |
-
static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); }
|
| 146 |
-
static __device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); }
|
| 147 |
-
static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); }
|
| 148 |
-
static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); }
|
| 149 |
-
static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); }
|
| 150 |
-
static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); }
|
| 151 |
-
static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); }
|
| 152 |
-
static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); }
|
| 153 |
-
static __device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
|
| 154 |
-
static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
|
| 155 |
-
static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
|
| 156 |
-
static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
|
| 157 |
-
static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
|
| 158 |
-
static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
|
| 159 |
-
static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
|
| 160 |
-
static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
|
| 161 |
-
static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
|
| 162 |
-
static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); }
|
| 163 |
-
static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); }
|
| 164 |
-
static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); }
|
| 165 |
-
static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); }
|
| 166 |
-
static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); }
|
| 167 |
-
static __device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); }
|
| 168 |
-
|
| 169 |
-
template<class T> static __device__ __forceinline__ T zero_value(void);
|
| 170 |
-
template<> __device__ __forceinline__ float zero_value<float> (void) { return 0.f; }
|
| 171 |
-
template<> __device__ __forceinline__ float2 zero_value<float2>(void) { return make_float2(0.f, 0.f); }
|
| 172 |
-
template<> __device__ __forceinline__ float4 zero_value<float4>(void) { return make_float4(0.f, 0.f, 0.f, 0.f); }
|
| 173 |
-
static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); }
|
| 174 |
-
static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); }
|
| 175 |
-
static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); }
|
| 176 |
-
static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); }
|
| 177 |
-
static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, a.y, a.z, b); }
|
| 178 |
-
static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); }
|
| 179 |
-
static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); }
|
| 180 |
-
static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); }
|
| 181 |
-
static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); }
|
| 182 |
-
|
| 183 |
-
template<class T> static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; }
|
| 184 |
-
|
| 185 |
-
//------------------------------------------------------------------------
|
| 186 |
-
// Triangle ID <-> float32 conversion functions to support very large triangle IDs.
|
| 187 |
-
//
|
| 188 |
-
// Values up to and including 16777216 (also, negative values) are converted trivially and retain
|
| 189 |
-
// compatibility with previous versions. Larger values are mapped to unique float32 that are not equal to
|
| 190 |
-
// the ID. The largest value that converts to float32 and back without generating inf or nan is 889192447.
|
| 191 |
-
|
| 192 |
-
static __device__ __forceinline__ int float_to_triidx(float x) { if (x <= 16777216.f) return (int)x; return __float_as_int(x) - 0x4a800000; }
|
| 193 |
-
static __device__ __forceinline__ float triidx_to_float(int x) { if (x <= 0x01000000) return (float)x; return __int_as_float(0x4a800000 + x); }
|
| 194 |
-
|
| 195 |
-
//------------------------------------------------------------------------
|
| 196 |
-
// Coalesced atomics. These are all done via macros.
|
| 197 |
-
|
| 198 |
-
#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
|
| 199 |
-
|
| 200 |
-
#define CA_TEMP _ca_temp
|
| 201 |
-
#define CA_TEMP_PARAM float* CA_TEMP
|
| 202 |
-
#define CA_DECLARE_TEMP(threads_per_block) \
|
| 203 |
-
__shared__ float CA_TEMP[(threads_per_block)]
|
| 204 |
-
|
| 205 |
-
#define CA_SET_GROUP_MASK(group, thread_mask) \
|
| 206 |
-
bool _ca_leader; \
|
| 207 |
-
float* _ca_ptr; \
|
| 208 |
-
do { \
|
| 209 |
-
int tidx = threadIdx.x + blockDim.x * threadIdx.y; \
|
| 210 |
-
int lane = tidx & 31; \
|
| 211 |
-
int warp = tidx >> 5; \
|
| 212 |
-
int tmask = __match_any_sync((thread_mask), (group)); \
|
| 213 |
-
int leader = __ffs(tmask) - 1; \
|
| 214 |
-
_ca_leader = (leader == lane); \
|
| 215 |
-
_ca_ptr = &_ca_temp[((warp << 5) + leader)]; \
|
| 216 |
-
} while(0)
|
| 217 |
-
|
| 218 |
-
#define CA_SET_GROUP(group) \
|
| 219 |
-
CA_SET_GROUP_MASK((group), 0xffffffffu)
|
| 220 |
-
|
| 221 |
-
#define caAtomicAdd(ptr, value) \
|
| 222 |
-
do { \
|
| 223 |
-
if (_ca_leader) \
|
| 224 |
-
*_ca_ptr = 0.f; \
|
| 225 |
-
atomicAdd(_ca_ptr, (value)); \
|
| 226 |
-
if (_ca_leader) \
|
| 227 |
-
atomicAdd((ptr), *_ca_ptr); \
|
| 228 |
-
} while(0)
|
| 229 |
-
|
| 230 |
-
#define caAtomicAdd3_xyw(ptr, x, y, w) \
|
| 231 |
-
do { \
|
| 232 |
-
caAtomicAdd((ptr), (x)); \
|
| 233 |
-
caAtomicAdd((ptr)+1, (y)); \
|
| 234 |
-
caAtomicAdd((ptr)+3, (w)); \
|
| 235 |
-
} while(0)
|
| 236 |
-
|
| 237 |
-
#define caAtomicAddTexture(ptr, level, idx, value) \
|
| 238 |
-
do { \
|
| 239 |
-
CA_SET_GROUP((idx) ^ ((level) << 27)); \
|
| 240 |
-
caAtomicAdd((ptr)+(idx), (value)); \
|
| 241 |
-
} while(0)
|
| 242 |
-
|
| 243 |
-
//------------------------------------------------------------------------
|
| 244 |
-
// Disable atomic coalescing for compute capability lower than 7.x
|
| 245 |
-
|
| 246 |
-
#else // __CUDA_ARCH__ >= 700
|
| 247 |
-
#define CA_TEMP _ca_temp
|
| 248 |
-
#define CA_TEMP_PARAM float CA_TEMP
|
| 249 |
-
#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
|
| 250 |
-
#define CA_SET_GROUP_MASK(group, thread_mask)
|
| 251 |
-
#define CA_SET_GROUP(group)
|
| 252 |
-
#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
|
| 253 |
-
#define caAtomicAdd3_xyw(ptr, x, y, w) \
|
| 254 |
-
do { \
|
| 255 |
-
atomicAdd((ptr), (x)); \
|
| 256 |
-
atomicAdd((ptr)+1, (y)); \
|
| 257 |
-
atomicAdd((ptr)+3, (w)); \
|
| 258 |
-
} while(0)
|
| 259 |
-
#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
|
| 260 |
-
#endif // __CUDA_ARCH__ >= 700
|
| 261 |
-
|
| 262 |
-
//------------------------------------------------------------------------
|
| 263 |
-
#endif // __CUDACC__
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp
DELETED
|
@@ -1,63 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
|
| 11 |
-
//------------------------------------------------------------------------
|
| 12 |
-
// This is a slimmed-down and modernized version of the original
|
| 13 |
-
// CudaRaster codebase that accompanied the HPG 2011 paper
|
| 14 |
-
// "High-Performance Software Rasterization on GPUs" by Laine and Karras.
|
| 15 |
-
// Modifications have been made to accommodate post-Volta execution model
|
| 16 |
-
// with warp divergence. Support for shading, blending, quad rendering,
|
| 17 |
-
// and supersampling have been removed as unnecessary for nvdiffrast.
|
| 18 |
-
//------------------------------------------------------------------------
|
| 19 |
-
|
| 20 |
-
namespace CR
|
| 21 |
-
{
|
| 22 |
-
|
| 23 |
-
class RasterImpl;
|
| 24 |
-
|
| 25 |
-
//------------------------------------------------------------------------
|
| 26 |
-
// Interface class to isolate user from implementation details.
|
| 27 |
-
//------------------------------------------------------------------------
|
| 28 |
-
|
| 29 |
-
class CudaRaster
|
| 30 |
-
{
|
| 31 |
-
public:
|
| 32 |
-
enum
|
| 33 |
-
{
|
| 34 |
-
RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling.
|
| 35 |
-
RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set.
|
| 36 |
-
};
|
| 37 |
-
|
| 38 |
-
public:
|
| 39 |
-
CudaRaster (void);
|
| 40 |
-
~CudaRaster (void);
|
| 41 |
-
|
| 42 |
-
void setBufferSize (int width, int height, int numImages); // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
|
| 43 |
-
void setViewport (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup.
|
| 44 |
-
void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
|
| 45 |
-
void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
|
| 46 |
-
void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
|
| 47 |
-
void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
|
| 48 |
-
bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
|
| 49 |
-
void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
|
| 50 |
-
void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
|
| 51 |
-
void swapDepthAndPeel (void); // Swap depth and peeling buffers.
|
| 52 |
-
|
| 53 |
-
private:
|
| 54 |
-
CudaRaster (const CudaRaster&); // forbidden
|
| 55 |
-
CudaRaster& operator= (const CudaRaster&); // forbidden
|
| 56 |
-
|
| 57 |
-
private:
|
| 58 |
-
RasterImpl* m_impl; // Opaque pointer to implementation.
|
| 59 |
-
};
|
| 60 |
-
|
| 61 |
-
//------------------------------------------------------------------------
|
| 62 |
-
} // namespace CR
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl
DELETED
|
@@ -1,423 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
//------------------------------------------------------------------------
|
| 10 |
-
|
| 11 |
-
__device__ __inline__ void binRasterImpl(const CRParams p)
|
| 12 |
-
{
|
| 13 |
-
__shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
|
| 14 |
-
__shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
|
| 15 |
-
__shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
|
| 16 |
-
__shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
|
| 17 |
-
__shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
|
| 18 |
-
__shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
|
| 19 |
-
__shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
|
| 20 |
-
__shared__ volatile U32 s_batchPos;
|
| 21 |
-
__shared__ volatile U32 s_bufCount;
|
| 22 |
-
__shared__ volatile U32 s_overTotal;
|
| 23 |
-
__shared__ volatile U32 s_allocBase;
|
| 24 |
-
|
| 25 |
-
const CRImageParams& ip = getImageParams(p, blockIdx.z);
|
| 26 |
-
CRAtomics& atomics = p.atomics[blockIdx.z];
|
| 27 |
-
const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
|
| 28 |
-
const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
|
| 29 |
-
|
| 30 |
-
S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
|
| 31 |
-
S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
|
| 32 |
-
S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
|
| 33 |
-
S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
|
| 34 |
-
S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
|
| 35 |
-
|
| 36 |
-
if (atomics.numSubtris > p.maxSubtris)
|
| 37 |
-
return;
|
| 38 |
-
|
| 39 |
-
// per-thread state
|
| 40 |
-
int thrInBlock = threadIdx.x + threadIdx.y * 32;
|
| 41 |
-
int batchPos = 0;
|
| 42 |
-
|
| 43 |
-
// first 16 elements of s_broadcast are always zero
|
| 44 |
-
if (thrInBlock < 16)
|
| 45 |
-
s_broadcast[thrInBlock] = 0;
|
| 46 |
-
|
| 47 |
-
// initialize output linked lists and offsets
|
| 48 |
-
if (thrInBlock < p.numBins)
|
| 49 |
-
{
|
| 50 |
-
binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
|
| 51 |
-
s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
|
| 52 |
-
s_outTotal[thrInBlock] = 0;
|
| 53 |
-
}
|
| 54 |
-
|
| 55 |
-
// repeat until done
|
| 56 |
-
for(;;)
|
| 57 |
-
{
|
| 58 |
-
// get batch
|
| 59 |
-
if (thrInBlock == 0)
|
| 60 |
-
s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
|
| 61 |
-
__syncthreads();
|
| 62 |
-
batchPos = s_batchPos;
|
| 63 |
-
|
| 64 |
-
// all batches done?
|
| 65 |
-
if (batchPos >= ip.triCount)
|
| 66 |
-
break;
|
| 67 |
-
|
| 68 |
-
// per-thread state
|
| 69 |
-
int bufIndex = 0;
|
| 70 |
-
int bufCount = 0;
|
| 71 |
-
int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
|
| 72 |
-
|
| 73 |
-
// loop over batch as long as we have triangles in it
|
| 74 |
-
do
|
| 75 |
-
{
|
| 76 |
-
// read more triangles
|
| 77 |
-
while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
|
| 78 |
-
{
|
| 79 |
-
// get subtriangle count
|
| 80 |
-
|
| 81 |
-
int triIdx = batchPos + thrInBlock;
|
| 82 |
-
int num = 0;
|
| 83 |
-
if (triIdx < batchEnd)
|
| 84 |
-
num = triSubtris[triIdx];
|
| 85 |
-
|
| 86 |
-
// cumulative sum of subtriangles within each warp
|
| 87 |
-
U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
|
| 88 |
-
if (__any_sync(~0u, num > 1))
|
| 89 |
-
{
|
| 90 |
-
myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
|
| 91 |
-
myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
|
| 92 |
-
}
|
| 93 |
-
if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
|
| 94 |
-
s_broadcast[threadIdx.y + 16] = myIdx + num;
|
| 95 |
-
__syncthreads();
|
| 96 |
-
|
| 97 |
-
// cumulative sum of per-warp subtriangle counts
|
| 98 |
-
// Note: cannot have more than 32 warps or this needs to sync between each step.
|
| 99 |
-
bool act = (thrInBlock < CR_BIN_WARPS);
|
| 100 |
-
U32 actMask = __ballot_sync(~0u, act);
|
| 101 |
-
if (threadIdx.y == 0 && act)
|
| 102 |
-
{
|
| 103 |
-
volatile U32* ptr = &s_broadcast[thrInBlock + 16];
|
| 104 |
-
U32 val = *ptr;
|
| 105 |
-
#if (CR_BIN_WARPS > 1)
|
| 106 |
-
val += ptr[-1]; __syncwarp(actMask);
|
| 107 |
-
*ptr = val; __syncwarp(actMask);
|
| 108 |
-
#endif
|
| 109 |
-
#if (CR_BIN_WARPS > 2)
|
| 110 |
-
val += ptr[-2]; __syncwarp(actMask);
|
| 111 |
-
*ptr = val; __syncwarp(actMask);
|
| 112 |
-
#endif
|
| 113 |
-
#if (CR_BIN_WARPS > 4)
|
| 114 |
-
val += ptr[-4]; __syncwarp(actMask);
|
| 115 |
-
*ptr = val; __syncwarp(actMask);
|
| 116 |
-
#endif
|
| 117 |
-
#if (CR_BIN_WARPS > 8)
|
| 118 |
-
val += ptr[-8]; __syncwarp(actMask);
|
| 119 |
-
*ptr = val; __syncwarp(actMask);
|
| 120 |
-
#endif
|
| 121 |
-
#if (CR_BIN_WARPS > 16)
|
| 122 |
-
val += ptr[-16]; __syncwarp(actMask);
|
| 123 |
-
*ptr = val; __syncwarp(actMask);
|
| 124 |
-
#endif
|
| 125 |
-
|
| 126 |
-
// initially assume that we consume everything
|
| 127 |
-
// only last active thread does the writes
|
| 128 |
-
if (threadIdx.x == CR_BIN_WARPS - 1)
|
| 129 |
-
{
|
| 130 |
-
s_batchPos = batchPos + CR_BIN_WARPS * 32;
|
| 131 |
-
s_bufCount = bufCount + val;
|
| 132 |
-
}
|
| 133 |
-
}
|
| 134 |
-
__syncthreads();
|
| 135 |
-
|
| 136 |
-
// skip if no subtriangles
|
| 137 |
-
if (num)
|
| 138 |
-
{
|
| 139 |
-
// calculate write position for first subtriangle
|
| 140 |
-
U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
|
| 141 |
-
|
| 142 |
-
// only write if entire triangle fits
|
| 143 |
-
if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
|
| 144 |
-
{
|
| 145 |
-
pos += bufIndex; // adjust for current start position
|
| 146 |
-
pos &= CR_ARRAY_SIZE(s_triBuf)-1;
|
| 147 |
-
if (num == 1)
|
| 148 |
-
s_triBuf[pos] = triIdx * 8 + 7; // single triangle
|
| 149 |
-
else
|
| 150 |
-
{
|
| 151 |
-
for (int i=0; i < num; i++)
|
| 152 |
-
{
|
| 153 |
-
s_triBuf[pos] = triIdx * 8 + i;
|
| 154 |
-
pos++;
|
| 155 |
-
pos &= CR_ARRAY_SIZE(s_triBuf)-1;
|
| 156 |
-
}
|
| 157 |
-
}
|
| 158 |
-
} else if (pos <= CR_ARRAY_SIZE(s_triBuf))
|
| 159 |
-
{
|
| 160 |
-
// this triangle is the first that failed, overwrite total count and triangle count
|
| 161 |
-
s_batchPos = batchPos + thrInBlock;
|
| 162 |
-
s_bufCount = pos;
|
| 163 |
-
}
|
| 164 |
-
}
|
| 165 |
-
|
| 166 |
-
// update triangle counts
|
| 167 |
-
__syncthreads();
|
| 168 |
-
batchPos = s_batchPos;
|
| 169 |
-
bufCount = s_bufCount;
|
| 170 |
-
}
|
| 171 |
-
|
| 172 |
-
// make every warp clear its output buffers
|
| 173 |
-
for (int i=threadIdx.x; i < p.numBins; i += 32)
|
| 174 |
-
s_outMask[threadIdx.y][i] = 0;
|
| 175 |
-
__syncwarp();
|
| 176 |
-
|
| 177 |
-
// choose our triangle
|
| 178 |
-
uint4 triData = make_uint4(0, 0, 0, 0);
|
| 179 |
-
if (thrInBlock < bufCount)
|
| 180 |
-
{
|
| 181 |
-
U32 triPos = bufIndex + thrInBlock;
|
| 182 |
-
triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
|
| 183 |
-
|
| 184 |
-
// find triangle
|
| 185 |
-
int triIdx = s_triBuf[triPos];
|
| 186 |
-
int dataIdx = triIdx >> 3;
|
| 187 |
-
int subtriIdx = triIdx & 7;
|
| 188 |
-
if (subtriIdx != 7)
|
| 189 |
-
dataIdx = triHeader[dataIdx].misc + subtriIdx;
|
| 190 |
-
|
| 191 |
-
// read triangle
|
| 192 |
-
|
| 193 |
-
triData = *(((const uint4*)triHeader) + dataIdx);
|
| 194 |
-
}
|
| 195 |
-
|
| 196 |
-
// setup bounding box and edge functions, and rasterize
|
| 197 |
-
S32 lox, loy, hix, hiy;
|
| 198 |
-
bool hasTri = (thrInBlock < bufCount);
|
| 199 |
-
U32 hasTriMask = __ballot_sync(~0u, hasTri);
|
| 200 |
-
if (hasTri)
|
| 201 |
-
{
|
| 202 |
-
S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
|
| 203 |
-
S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
|
| 204 |
-
S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
|
| 205 |
-
S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
|
| 206 |
-
S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
|
| 207 |
-
S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
|
| 208 |
-
int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
|
| 209 |
-
lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
|
| 210 |
-
loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
|
| 211 |
-
hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
|
| 212 |
-
hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
|
| 213 |
-
|
| 214 |
-
U32 bit = 1 << threadIdx.x;
|
| 215 |
-
#if __CUDA_ARCH__ >= 700
|
| 216 |
-
bool multi = (hix != lox || hiy != loy);
|
| 217 |
-
if (!__any_sync(hasTriMask, multi))
|
| 218 |
-
{
|
| 219 |
-
int binIdx = lox + p.widthBins * loy;
|
| 220 |
-
U32 mask = __match_any_sync(hasTriMask, binIdx);
|
| 221 |
-
s_outMask[threadIdx.y][binIdx] = mask;
|
| 222 |
-
__syncwarp(hasTriMask);
|
| 223 |
-
} else
|
| 224 |
-
#endif
|
| 225 |
-
{
|
| 226 |
-
bool complex = (hix > lox+1 || hiy > loy+1);
|
| 227 |
-
if (!__any_sync(hasTriMask, complex))
|
| 228 |
-
{
|
| 229 |
-
int binIdx = lox + p.widthBins * loy;
|
| 230 |
-
atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
|
| 231 |
-
if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
|
| 232 |
-
if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
|
| 233 |
-
if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
|
| 234 |
-
} else
|
| 235 |
-
{
|
| 236 |
-
S32 d12x = d02x - d01x, d12y = d02y - d01y;
|
| 237 |
-
v0x -= lox << binLog, v0y -= loy << binLog;
|
| 238 |
-
|
| 239 |
-
S32 t01 = v0x * d01y - v0y * d01x;
|
| 240 |
-
S32 t02 = v0y * d02x - v0x * d02y;
|
| 241 |
-
S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
|
| 242 |
-
S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
|
| 243 |
-
S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
|
| 244 |
-
S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
|
| 245 |
-
|
| 246 |
-
int width = hix - lox + 1;
|
| 247 |
-
d01x += width * d01y;
|
| 248 |
-
d02x += width * d02y;
|
| 249 |
-
d12x += width * d12y;
|
| 250 |
-
|
| 251 |
-
U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
|
| 252 |
-
U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
|
| 253 |
-
U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
|
| 254 |
-
int stride = p.widthBins * 4;
|
| 255 |
-
int ptrYInc = stride - width * 4;
|
| 256 |
-
|
| 257 |
-
do
|
| 258 |
-
{
|
| 259 |
-
if (b01 >= 0 && b02 >= 0 && b12 >= 0)
|
| 260 |
-
atomicOr((U32*)currPtr, bit);
|
| 261 |
-
currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
|
| 262 |
-
if (currPtr == skipPtr)
|
| 263 |
-
currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
|
| 264 |
-
}
|
| 265 |
-
while (currPtr != endPtr);
|
| 266 |
-
}
|
| 267 |
-
}
|
| 268 |
-
}
|
| 269 |
-
|
| 270 |
-
// count per-bin contributions
|
| 271 |
-
if (thrInBlock == 0)
|
| 272 |
-
s_overTotal = 0; // overflow counter
|
| 273 |
-
|
| 274 |
-
// ensure that out masks are done
|
| 275 |
-
__syncthreads();
|
| 276 |
-
|
| 277 |
-
int overIndex = -1;
|
| 278 |
-
bool act = (thrInBlock < p.numBins);
|
| 279 |
-
U32 actMask = __ballot_sync(~0u, act);
|
| 280 |
-
if (act)
|
| 281 |
-
{
|
| 282 |
-
U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
|
| 283 |
-
U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
|
| 284 |
-
int total = 0;
|
| 285 |
-
for (int i = 0; i < CR_BIN_WARPS; i++)
|
| 286 |
-
{
|
| 287 |
-
total += __popc(*(U32*)srcPtr);
|
| 288 |
-
*(U32*)dstPtr = total;
|
| 289 |
-
srcPtr += (CR_MAXBINS_SQR + 1) * 4;
|
| 290 |
-
dstPtr += (CR_MAXBINS_SQR + 1) * 4;
|
| 291 |
-
}
|
| 292 |
-
|
| 293 |
-
// overflow => request a new segment
|
| 294 |
-
int ofs = s_outOfs[thrInBlock];
|
| 295 |
-
bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
|
| 296 |
-
U32 ovrMask = __ballot_sync(actMask, ovr);
|
| 297 |
-
if (ovr)
|
| 298 |
-
{
|
| 299 |
-
overIndex = __popc(ovrMask & getLaneMaskLt());
|
| 300 |
-
if (overIndex == 0)
|
| 301 |
-
s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
|
| 302 |
-
__syncwarp(ovrMask);
|
| 303 |
-
overIndex += s_broadcast[threadIdx.y + 16];
|
| 304 |
-
s_overIndex[thrInBlock] = overIndex;
|
| 305 |
-
}
|
| 306 |
-
}
|
| 307 |
-
|
| 308 |
-
// sync after overTotal is ready
|
| 309 |
-
__syncthreads();
|
| 310 |
-
|
| 311 |
-
// at least one segment overflowed => allocate segments
|
| 312 |
-
U32 overTotal = s_overTotal;
|
| 313 |
-
U32 allocBase = 0;
|
| 314 |
-
if (overTotal > 0)
|
| 315 |
-
{
|
| 316 |
-
// allocate memory
|
| 317 |
-
if (thrInBlock == 0)
|
| 318 |
-
{
|
| 319 |
-
U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
|
| 320 |
-
s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
|
| 321 |
-
}
|
| 322 |
-
__syncthreads();
|
| 323 |
-
allocBase = s_allocBase;
|
| 324 |
-
|
| 325 |
-
// did my bin overflow?
|
| 326 |
-
if (overIndex != -1)
|
| 327 |
-
{
|
| 328 |
-
// calculate new segment index
|
| 329 |
-
int segIdx = allocBase + overIndex;
|
| 330 |
-
|
| 331 |
-
// add to linked list
|
| 332 |
-
if (s_outOfs[thrInBlock] < 0)
|
| 333 |
-
binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
|
| 334 |
-
else
|
| 335 |
-
binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
|
| 336 |
-
|
| 337 |
-
// defaults
|
| 338 |
-
binSegNext [segIdx] = -1;
|
| 339 |
-
binSegCount[segIdx] = CR_BIN_SEG_SIZE;
|
| 340 |
-
}
|
| 341 |
-
}
|
| 342 |
-
|
| 343 |
-
// concurrent emission -- each warp handles its own triangle
|
| 344 |
-
if (thrInBlock < bufCount)
|
| 345 |
-
{
|
| 346 |
-
int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
|
| 347 |
-
int currBin = lox + loy * p.widthBins;
|
| 348 |
-
int skipBin = (hix + 1) + loy * p.widthBins;
|
| 349 |
-
int endBin = lox + (hiy + 1) * p.widthBins;
|
| 350 |
-
int binYInc = p.widthBins - (hix - lox + 1);
|
| 351 |
-
|
| 352 |
-
// loop over triangle's bins
|
| 353 |
-
do
|
| 354 |
-
{
|
| 355 |
-
U32 outMask = s_outMask[threadIdx.y][currBin];
|
| 356 |
-
if (outMask & (1<<threadIdx.x))
|
| 357 |
-
{
|
| 358 |
-
int idx = __popc(outMask & getLaneMaskLt());
|
| 359 |
-
if (threadIdx.y > 0)
|
| 360 |
-
idx += s_outCount[threadIdx.y-1][currBin];
|
| 361 |
-
|
| 362 |
-
int base = s_outOfs[currBin];
|
| 363 |
-
int free = (-base) & (CR_BIN_SEG_SIZE - 1);
|
| 364 |
-
if (idx >= free)
|
| 365 |
-
idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
|
| 366 |
-
else
|
| 367 |
-
idx += base;
|
| 368 |
-
|
| 369 |
-
binSegData[idx] = s_triBuf[triPos];
|
| 370 |
-
}
|
| 371 |
-
|
| 372 |
-
currBin++;
|
| 373 |
-
if (currBin == skipBin)
|
| 374 |
-
currBin += binYInc, skipBin += p.widthBins;
|
| 375 |
-
}
|
| 376 |
-
while (currBin != endBin);
|
| 377 |
-
}
|
| 378 |
-
|
| 379 |
-
// wait all triangles to finish, then replace overflown segment offsets
|
| 380 |
-
__syncthreads();
|
| 381 |
-
if (thrInBlock < p.numBins)
|
| 382 |
-
{
|
| 383 |
-
U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
|
| 384 |
-
U32 oldOfs = s_outOfs[thrInBlock];
|
| 385 |
-
if (overIndex == -1)
|
| 386 |
-
s_outOfs[thrInBlock] = oldOfs + total;
|
| 387 |
-
else
|
| 388 |
-
{
|
| 389 |
-
int addr = oldOfs + total;
|
| 390 |
-
addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
|
| 391 |
-
addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
|
| 392 |
-
s_outOfs[thrInBlock] = addr;
|
| 393 |
-
}
|
| 394 |
-
s_outTotal[thrInBlock] += total;
|
| 395 |
-
}
|
| 396 |
-
|
| 397 |
-
// these triangles are now done
|
| 398 |
-
int count = ::min(bufCount, CR_BIN_WARPS * 32);
|
| 399 |
-
bufCount -= count;
|
| 400 |
-
bufIndex += count;
|
| 401 |
-
bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
|
| 402 |
-
}
|
| 403 |
-
while (bufCount > 0 || batchPos < batchEnd);
|
| 404 |
-
|
| 405 |
-
// flush all bins
|
| 406 |
-
if (thrInBlock < p.numBins)
|
| 407 |
-
{
|
| 408 |
-
int ofs = s_outOfs[thrInBlock];
|
| 409 |
-
if (ofs & (CR_BIN_SEG_SIZE-1))
|
| 410 |
-
{
|
| 411 |
-
int seg = ofs >> CR_BIN_SEG_LOG2;
|
| 412 |
-
binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
|
| 413 |
-
s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
|
| 414 |
-
}
|
| 415 |
-
}
|
| 416 |
-
}
|
| 417 |
-
|
| 418 |
-
// output totals
|
| 419 |
-
if (thrInBlock < p.numBins)
|
| 420 |
-
binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
|
| 421 |
-
}
|
| 422 |
-
|
| 423 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp
DELETED
|
@@ -1,94 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#include "../../framework.h"
|
| 10 |
-
#include "Buffer.hpp"
|
| 11 |
-
|
| 12 |
-
using namespace CR;
|
| 13 |
-
|
| 14 |
-
//------------------------------------------------------------------------
|
| 15 |
-
// GPU buffer.
|
| 16 |
-
//------------------------------------------------------------------------
|
| 17 |
-
|
| 18 |
-
Buffer::Buffer(void)
|
| 19 |
-
: m_gpuPtr(NULL),
|
| 20 |
-
m_bytes (0)
|
| 21 |
-
{
|
| 22 |
-
// empty
|
| 23 |
-
}
|
| 24 |
-
|
| 25 |
-
Buffer::~Buffer(void)
|
| 26 |
-
{
|
| 27 |
-
if (m_gpuPtr)
|
| 28 |
-
cudaFree(m_gpuPtr); // Don't throw an exception.
|
| 29 |
-
}
|
| 30 |
-
|
| 31 |
-
void Buffer::reset(size_t bytes)
|
| 32 |
-
{
|
| 33 |
-
if (bytes == m_bytes)
|
| 34 |
-
return;
|
| 35 |
-
|
| 36 |
-
if (m_gpuPtr)
|
| 37 |
-
{
|
| 38 |
-
NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
|
| 39 |
-
m_gpuPtr = NULL;
|
| 40 |
-
}
|
| 41 |
-
|
| 42 |
-
if (bytes > 0)
|
| 43 |
-
NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
|
| 44 |
-
|
| 45 |
-
m_bytes = bytes;
|
| 46 |
-
}
|
| 47 |
-
|
| 48 |
-
void Buffer::grow(size_t bytes)
|
| 49 |
-
{
|
| 50 |
-
if (bytes > m_bytes)
|
| 51 |
-
reset(bytes);
|
| 52 |
-
}
|
| 53 |
-
|
| 54 |
-
//------------------------------------------------------------------------
|
| 55 |
-
// Host buffer with page-locked memory.
|
| 56 |
-
//------------------------------------------------------------------------
|
| 57 |
-
|
| 58 |
-
HostBuffer::HostBuffer(void)
|
| 59 |
-
: m_hostPtr(NULL),
|
| 60 |
-
m_bytes (0)
|
| 61 |
-
{
|
| 62 |
-
// empty
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
HostBuffer::~HostBuffer(void)
|
| 66 |
-
{
|
| 67 |
-
if (m_hostPtr)
|
| 68 |
-
cudaFreeHost(m_hostPtr); // Don't throw an exception.
|
| 69 |
-
}
|
| 70 |
-
|
| 71 |
-
void HostBuffer::reset(size_t bytes)
|
| 72 |
-
{
|
| 73 |
-
if (bytes == m_bytes)
|
| 74 |
-
return;
|
| 75 |
-
|
| 76 |
-
if (m_hostPtr)
|
| 77 |
-
{
|
| 78 |
-
NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
|
| 79 |
-
m_hostPtr = NULL;
|
| 80 |
-
}
|
| 81 |
-
|
| 82 |
-
if (bytes > 0)
|
| 83 |
-
NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
|
| 84 |
-
|
| 85 |
-
m_bytes = bytes;
|
| 86 |
-
}
|
| 87 |
-
|
| 88 |
-
void HostBuffer::grow(size_t bytes)
|
| 89 |
-
{
|
| 90 |
-
if (bytes > m_bytes)
|
| 91 |
-
reset(bytes);
|
| 92 |
-
}
|
| 93 |
-
|
| 94 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp
DELETED
|
@@ -1,55 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
#include "Defs.hpp"
|
| 11 |
-
|
| 12 |
-
namespace CR
|
| 13 |
-
{
|
| 14 |
-
//------------------------------------------------------------------------
|
| 15 |
-
|
| 16 |
-
class Buffer
|
| 17 |
-
{
|
| 18 |
-
public:
|
| 19 |
-
Buffer (void);
|
| 20 |
-
~Buffer (void);
|
| 21 |
-
|
| 22 |
-
void reset (size_t bytes);
|
| 23 |
-
void grow (size_t bytes);
|
| 24 |
-
void* getPtr (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
|
| 25 |
-
size_t getSize (void) const { return m_bytes; }
|
| 26 |
-
|
| 27 |
-
void setPtr (void* ptr) { m_gpuPtr = ptr; }
|
| 28 |
-
|
| 29 |
-
private:
|
| 30 |
-
void* m_gpuPtr;
|
| 31 |
-
size_t m_bytes;
|
| 32 |
-
};
|
| 33 |
-
|
| 34 |
-
//------------------------------------------------------------------------
|
| 35 |
-
|
| 36 |
-
class HostBuffer
|
| 37 |
-
{
|
| 38 |
-
public:
|
| 39 |
-
HostBuffer (void);
|
| 40 |
-
~HostBuffer (void);
|
| 41 |
-
|
| 42 |
-
void reset (size_t bytes);
|
| 43 |
-
void grow (size_t bytes);
|
| 44 |
-
void* getPtr (void) { return m_hostPtr; }
|
| 45 |
-
size_t getSize (void) const { return m_bytes; }
|
| 46 |
-
|
| 47 |
-
void setPtr (void* ptr) { m_hostPtr = ptr; }
|
| 48 |
-
|
| 49 |
-
private:
|
| 50 |
-
void* m_hostPtr;
|
| 51 |
-
size_t m_bytes;
|
| 52 |
-
};
|
| 53 |
-
|
| 54 |
-
//------------------------------------------------------------------------
|
| 55 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl
DELETED
|
@@ -1,730 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
//------------------------------------------------------------------------
|
| 10 |
-
|
| 11 |
-
__device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
|
| 12 |
-
{
|
| 13 |
-
int tileX = tileInBin & (CR_BIN_SIZE - 1);
|
| 14 |
-
int tileY = tileInBin >> CR_BIN_LOG2;
|
| 15 |
-
return tileX + tileY * widthTiles;
|
| 16 |
-
}
|
| 17 |
-
|
| 18 |
-
//------------------------------------------------------------------------
|
| 19 |
-
|
| 20 |
-
__device__ __inline__ void coarseRasterImpl(const CRParams p)
|
| 21 |
-
{
|
| 22 |
-
// Common.
|
| 23 |
-
|
| 24 |
-
__shared__ volatile U32 s_workCounter;
|
| 25 |
-
__shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB
|
| 26 |
-
|
| 27 |
-
// Input.
|
| 28 |
-
|
| 29 |
-
__shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB
|
| 30 |
-
__shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB
|
| 31 |
-
__shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB
|
| 32 |
-
__shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB
|
| 33 |
-
__shared__ volatile S32 s_triQueueWritePos;
|
| 34 |
-
__shared__ volatile U32 s_binStreamSelectedOfs;
|
| 35 |
-
__shared__ volatile U32 s_binStreamSelectedSize;
|
| 36 |
-
|
| 37 |
-
// Output.
|
| 38 |
-
|
| 39 |
-
__shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
|
| 40 |
-
__shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
|
| 41 |
-
__shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning
|
| 42 |
-
__shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning
|
| 43 |
-
__shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB
|
| 44 |
-
__shared__ volatile U32 s_firstAllocSeg;
|
| 45 |
-
__shared__ volatile U32 s_firstActiveIdx;
|
| 46 |
-
|
| 47 |
-
// Pointers and constants.
|
| 48 |
-
|
| 49 |
-
CRAtomics& atomics = p.atomics[blockIdx.z];
|
| 50 |
-
const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
|
| 51 |
-
const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
|
| 52 |
-
const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
|
| 53 |
-
const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
|
| 54 |
-
const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
|
| 55 |
-
const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
|
| 56 |
-
S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
|
| 57 |
-
S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
|
| 58 |
-
S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
|
| 59 |
-
S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
|
| 60 |
-
S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
|
| 61 |
-
|
| 62 |
-
int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
|
| 63 |
-
int thrInBlock = threadIdx.x + threadIdx.y * 32;
|
| 64 |
-
int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
|
| 65 |
-
|
| 66 |
-
if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
|
| 67 |
-
return;
|
| 68 |
-
|
| 69 |
-
// Initialize sharedmem arrays.
|
| 70 |
-
|
| 71 |
-
if (thrInBlock == 0)
|
| 72 |
-
{
|
| 73 |
-
s_tileEmitPrefixSum[0] = 0;
|
| 74 |
-
s_tileAllocPrefixSum[0] = 0;
|
| 75 |
-
}
|
| 76 |
-
s_scanTemp[threadIdx.y][threadIdx.x] = 0;
|
| 77 |
-
|
| 78 |
-
// Sort bins in descending order of triangle count.
|
| 79 |
-
|
| 80 |
-
for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
|
| 81 |
-
{
|
| 82 |
-
int count = 0;
|
| 83 |
-
for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
|
| 84 |
-
count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
|
| 85 |
-
s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
|
| 86 |
-
}
|
| 87 |
-
|
| 88 |
-
__syncthreads();
|
| 89 |
-
sortShared(s_binOrder, p.numBins);
|
| 90 |
-
|
| 91 |
-
// Process each bin by one block.
|
| 92 |
-
|
| 93 |
-
for (;;)
|
| 94 |
-
{
|
| 95 |
-
// Pick a bin for the block.
|
| 96 |
-
|
| 97 |
-
if (thrInBlock == 0)
|
| 98 |
-
s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
|
| 99 |
-
__syncthreads();
|
| 100 |
-
|
| 101 |
-
int workCounter = s_workCounter;
|
| 102 |
-
if (workCounter >= p.numBins)
|
| 103 |
-
break;
|
| 104 |
-
|
| 105 |
-
U32 binOrder = s_binOrder[workCounter];
|
| 106 |
-
bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
|
| 107 |
-
if (binEmpty && !p.deferredClear)
|
| 108 |
-
break;
|
| 109 |
-
|
| 110 |
-
int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
|
| 111 |
-
|
| 112 |
-
// Initialize input/output streams.
|
| 113 |
-
|
| 114 |
-
int triQueueWritePos = 0;
|
| 115 |
-
int triQueueReadPos = 0;
|
| 116 |
-
|
| 117 |
-
if (thrInBlock < CR_BIN_STREAMS_SIZE)
|
| 118 |
-
{
|
| 119 |
-
int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
|
| 120 |
-
s_binStreamCurrSeg[thrInBlock] = segIdx;
|
| 121 |
-
s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
|
| 122 |
-
}
|
| 123 |
-
|
| 124 |
-
for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
|
| 125 |
-
s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
|
| 126 |
-
|
| 127 |
-
// Initialize per-bin state.
|
| 128 |
-
|
| 129 |
-
int binY = idiv_fast(binIdx, p.widthBins);
|
| 130 |
-
int binX = binIdx - binY * p.widthBins;
|
| 131 |
-
int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
|
| 132 |
-
int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
|
| 133 |
-
int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
|
| 134 |
-
int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
|
| 135 |
-
int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
|
| 136 |
-
|
| 137 |
-
// Entire block: Merge input streams and process triangles.
|
| 138 |
-
|
| 139 |
-
if (!binEmpty)
|
| 140 |
-
do
|
| 141 |
-
{
|
| 142 |
-
//------------------------------------------------------------------------
|
| 143 |
-
// Merge.
|
| 144 |
-
//------------------------------------------------------------------------
|
| 145 |
-
|
| 146 |
-
// Entire block: Not enough triangles => merge and queue segments.
|
| 147 |
-
// NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
|
| 148 |
-
|
| 149 |
-
while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
|
| 150 |
-
{
|
| 151 |
-
// First warp: Choose the segment with the lowest initial triangle index.
|
| 152 |
-
|
| 153 |
-
bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
|
| 154 |
-
U32 hasStreamMask = __ballot_sync(~0u, hasStream);
|
| 155 |
-
if (hasStream)
|
| 156 |
-
{
|
| 157 |
-
// Find the stream with the lowest triangle index.
|
| 158 |
-
|
| 159 |
-
U32 firstTri = s_binStreamFirstTri[thrInBlock];
|
| 160 |
-
U32 t = firstTri;
|
| 161 |
-
volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
|
| 162 |
-
|
| 163 |
-
#if (CR_BIN_STREAMS_SIZE > 1)
|
| 164 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
|
| 165 |
-
#endif
|
| 166 |
-
#if (CR_BIN_STREAMS_SIZE > 2)
|
| 167 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
|
| 168 |
-
#endif
|
| 169 |
-
#if (CR_BIN_STREAMS_SIZE > 4)
|
| 170 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
|
| 171 |
-
#endif
|
| 172 |
-
#if (CR_BIN_STREAMS_SIZE > 8)
|
| 173 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
|
| 174 |
-
#endif
|
| 175 |
-
#if (CR_BIN_STREAMS_SIZE > 16)
|
| 176 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
|
| 177 |
-
#endif
|
| 178 |
-
v[0] = t; __syncwarp(hasStreamMask);
|
| 179 |
-
|
| 180 |
-
// Consume and broadcast.
|
| 181 |
-
|
| 182 |
-
bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
|
| 183 |
-
U32 firstMask = __ballot_sync(hasStreamMask, first);
|
| 184 |
-
if (first && (firstMask >> threadIdx.x) == 1u)
|
| 185 |
-
{
|
| 186 |
-
int segIdx = s_binStreamCurrSeg[thrInBlock];
|
| 187 |
-
s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
|
| 188 |
-
if (segIdx != -1)
|
| 189 |
-
{
|
| 190 |
-
int segSize = binSegCount[segIdx];
|
| 191 |
-
int segNext = binSegNext[segIdx];
|
| 192 |
-
s_binStreamSelectedSize = segSize;
|
| 193 |
-
s_triQueueWritePos = triQueueWritePos + segSize;
|
| 194 |
-
s_binStreamCurrSeg[thrInBlock] = segNext;
|
| 195 |
-
s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
|
| 196 |
-
}
|
| 197 |
-
}
|
| 198 |
-
}
|
| 199 |
-
|
| 200 |
-
// No more segments => break.
|
| 201 |
-
|
| 202 |
-
__syncthreads();
|
| 203 |
-
triQueueWritePos = s_triQueueWritePos;
|
| 204 |
-
int segOfs = s_binStreamSelectedOfs;
|
| 205 |
-
if (segOfs < 0)
|
| 206 |
-
break;
|
| 207 |
-
|
| 208 |
-
int segSize = s_binStreamSelectedSize;
|
| 209 |
-
__syncthreads();
|
| 210 |
-
|
| 211 |
-
// Fetch triangles into the queue.
|
| 212 |
-
|
| 213 |
-
for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
|
| 214 |
-
{
|
| 215 |
-
S32 triIdx = binSegData[segOfs + idxInSeg];
|
| 216 |
-
s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
|
| 217 |
-
}
|
| 218 |
-
}
|
| 219 |
-
|
| 220 |
-
// All threads: Clear emit masks.
|
| 221 |
-
|
| 222 |
-
for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
|
| 223 |
-
s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
|
| 224 |
-
|
| 225 |
-
__syncthreads();
|
| 226 |
-
|
| 227 |
-
//------------------------------------------------------------------------
|
| 228 |
-
// Raster.
|
| 229 |
-
//------------------------------------------------------------------------
|
| 230 |
-
|
| 231 |
-
// Triangle per thread: Read from the queue.
|
| 232 |
-
|
| 233 |
-
int triIdx = -1;
|
| 234 |
-
if (triQueueReadPos + thrInBlock < triQueueWritePos)
|
| 235 |
-
triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
|
| 236 |
-
|
| 237 |
-
uint4 triData = make_uint4(0, 0, 0, 0);
|
| 238 |
-
if (triIdx != -1)
|
| 239 |
-
{
|
| 240 |
-
int dataIdx = triIdx >> 3;
|
| 241 |
-
int subtriIdx = triIdx & 7;
|
| 242 |
-
if (subtriIdx != 7)
|
| 243 |
-
dataIdx = triHeader[dataIdx].misc + subtriIdx;
|
| 244 |
-
triData = *((uint4*)triHeader + dataIdx);
|
| 245 |
-
}
|
| 246 |
-
|
| 247 |
-
// 32 triangles per warp: Record emits (= tile intersections).
|
| 248 |
-
|
| 249 |
-
if (__any_sync(~0u, triIdx != -1))
|
| 250 |
-
{
|
| 251 |
-
S32 v0x = sub_s16lo_s16lo(triData.x, originX);
|
| 252 |
-
S32 v0y = sub_s16hi_s16lo(triData.x, originY);
|
| 253 |
-
S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
|
| 254 |
-
S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
|
| 255 |
-
S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
|
| 256 |
-
S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
|
| 257 |
-
|
| 258 |
-
// Compute tile-based AABB.
|
| 259 |
-
|
| 260 |
-
int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
|
| 261 |
-
int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
|
| 262 |
-
int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
|
| 263 |
-
int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
|
| 264 |
-
int sizex = add_sub(hix, 1, lox);
|
| 265 |
-
int sizey = add_sub(hiy, 1, loy);
|
| 266 |
-
int area = sizex * sizey;
|
| 267 |
-
|
| 268 |
-
// Miscellaneous init.
|
| 269 |
-
|
| 270 |
-
U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
|
| 271 |
-
int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
|
| 272 |
-
U32 maskBit = 1 << threadIdx.x;
|
| 273 |
-
|
| 274 |
-
// Case A: All AABBs are small => record the full AABB using atomics.
|
| 275 |
-
|
| 276 |
-
if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
|
| 277 |
-
{
|
| 278 |
-
if (triIdx != -1)
|
| 279 |
-
{
|
| 280 |
-
atomicOr((U32*)currPtr, maskBit);
|
| 281 |
-
if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
|
| 282 |
-
if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
|
| 283 |
-
if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
|
| 284 |
-
}
|
| 285 |
-
}
|
| 286 |
-
else
|
| 287 |
-
{
|
| 288 |
-
// Compute warp-AABB (scan-32).
|
| 289 |
-
|
| 290 |
-
U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
|
| 291 |
-
if (triIdx == -1)
|
| 292 |
-
aabbMask = 0;
|
| 293 |
-
|
| 294 |
-
volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
|
| 295 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
|
| 296 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
|
| 297 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
|
| 298 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
|
| 299 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
|
| 300 |
-
v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
|
| 301 |
-
|
| 302 |
-
U32 maskX = aabbMask & 0xFFFF;
|
| 303 |
-
U32 maskY = aabbMask >> 16;
|
| 304 |
-
int wlox = findLeadingOne(maskX ^ (maskX - 1));
|
| 305 |
-
int wloy = findLeadingOne(maskY ^ (maskY - 1));
|
| 306 |
-
int whix = findLeadingOne(maskX);
|
| 307 |
-
int whiy = findLeadingOne(maskY);
|
| 308 |
-
int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
|
| 309 |
-
|
| 310 |
-
// Initialize edge functions.
|
| 311 |
-
|
| 312 |
-
S32 d12x = d02x - d01x;
|
| 313 |
-
S32 d12y = d02y - d01y;
|
| 314 |
-
v0x -= lox << tileLog;
|
| 315 |
-
v0y -= loy << tileLog;
|
| 316 |
-
|
| 317 |
-
S32 t01 = v0x * d01y - v0y * d01x;
|
| 318 |
-
S32 t02 = v0y * d02x - v0x * d02y;
|
| 319 |
-
S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
|
| 320 |
-
S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
|
| 321 |
-
S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
|
| 322 |
-
S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
|
| 323 |
-
|
| 324 |
-
d01x += sizex * d01y;
|
| 325 |
-
d02x += sizex * d02y;
|
| 326 |
-
d12x += sizex * d12y;
|
| 327 |
-
|
| 328 |
-
// Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
|
| 329 |
-
if (__any_sync(~0u, warea * 4 <= area * 8))
|
| 330 |
-
{
|
| 331 |
-
// Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
|
| 332 |
-
bool act = (triIdx != -1);
|
| 333 |
-
U32 actMask = __ballot_sync(~0u, act);
|
| 334 |
-
if (act)
|
| 335 |
-
{
|
| 336 |
-
for (int y = wloy; y <= whiy; y++)
|
| 337 |
-
{
|
| 338 |
-
bool yIn = (y >= loy && y <= hiy);
|
| 339 |
-
U32 yMask = __ballot_sync(actMask, yIn);
|
| 340 |
-
if (yIn)
|
| 341 |
-
{
|
| 342 |
-
for (int x = wlox; x <= whix; x++)
|
| 343 |
-
{
|
| 344 |
-
bool xyIn = (x >= lox && x <= hix);
|
| 345 |
-
U32 xyMask = __ballot_sync(yMask, xyIn);
|
| 346 |
-
if (xyIn)
|
| 347 |
-
{
|
| 348 |
-
U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
|
| 349 |
-
if (threadIdx.x == 31 - __clz(xyMask))
|
| 350 |
-
*(U32*)currPtr = res;
|
| 351 |
-
currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
|
| 352 |
-
}
|
| 353 |
-
}
|
| 354 |
-
currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
|
| 355 |
-
}
|
| 356 |
-
}
|
| 357 |
-
}
|
| 358 |
-
}
|
| 359 |
-
|
| 360 |
-
// Case C: General case => Check tiles in AABB, record using atomics.
|
| 361 |
-
|
| 362 |
-
else
|
| 363 |
-
{
|
| 364 |
-
if (triIdx != -1)
|
| 365 |
-
{
|
| 366 |
-
U8* skipPtr = currPtr + (sizex << 2);
|
| 367 |
-
U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
|
| 368 |
-
do
|
| 369 |
-
{
|
| 370 |
-
if (b01 >= 0 && b02 >= 0 && b12 >= 0)
|
| 371 |
-
atomicOr((U32*)currPtr, maskBit);
|
| 372 |
-
currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
|
| 373 |
-
if (currPtr == skipPtr)
|
| 374 |
-
currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
|
| 375 |
-
}
|
| 376 |
-
while (currPtr != endPtr);
|
| 377 |
-
}
|
| 378 |
-
}
|
| 379 |
-
}
|
| 380 |
-
}
|
| 381 |
-
|
| 382 |
-
__syncthreads();
|
| 383 |
-
|
| 384 |
-
//------------------------------------------------------------------------
|
| 385 |
-
// Count.
|
| 386 |
-
//------------------------------------------------------------------------
|
| 387 |
-
|
| 388 |
-
// Tile per thread: Initialize prefix sums.
|
| 389 |
-
|
| 390 |
-
for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
|
| 391 |
-
{
|
| 392 |
-
int tileInBin = tileInBin_base + thrInBlock;
|
| 393 |
-
bool act = (tileInBin < CR_BIN_SQR);
|
| 394 |
-
U32 actMask = __ballot_sync(~0u, act);
|
| 395 |
-
if (act)
|
| 396 |
-
{
|
| 397 |
-
// Compute prefix sum of emits over warps.
|
| 398 |
-
|
| 399 |
-
U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
|
| 400 |
-
U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
|
| 401 |
-
int tileEmits = 0;
|
| 402 |
-
for (int i = 0; i < CR_COARSE_WARPS; i++)
|
| 403 |
-
{
|
| 404 |
-
tileEmits += __popc(*(U32*)srcPtr);
|
| 405 |
-
*(U32*)dstPtr = tileEmits;
|
| 406 |
-
srcPtr += (CR_BIN_SQR + 1) * 4;
|
| 407 |
-
dstPtr += (CR_BIN_SQR + 1) * 4;
|
| 408 |
-
}
|
| 409 |
-
|
| 410 |
-
// Determine the number of segments to allocate.
|
| 411 |
-
|
| 412 |
-
int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
|
| 413 |
-
int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
|
| 414 |
-
volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
|
| 415 |
-
|
| 416 |
-
// All counters within the warp are small => compute prefix sum using ballot.
|
| 417 |
-
|
| 418 |
-
if (!__any_sync(actMask, tileEmits >= 2))
|
| 419 |
-
{
|
| 420 |
-
U32 m = getLaneMaskLe();
|
| 421 |
-
*v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
|
| 422 |
-
}
|
| 423 |
-
|
| 424 |
-
// Otherwise => scan-32 within the warp.
|
| 425 |
-
|
| 426 |
-
else
|
| 427 |
-
{
|
| 428 |
-
U32 sum = (tileEmits << emitShift) | tileAllocs;
|
| 429 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
|
| 430 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
|
| 431 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
|
| 432 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
|
| 433 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
|
| 434 |
-
*v = sum; __syncwarp(actMask);
|
| 435 |
-
}
|
| 436 |
-
}
|
| 437 |
-
}
|
| 438 |
-
|
| 439 |
-
// First warp: Scan-8.
|
| 440 |
-
|
| 441 |
-
__syncthreads();
|
| 442 |
-
|
| 443 |
-
bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
|
| 444 |
-
U32 scan8Mask = __ballot_sync(~0u, scan8);
|
| 445 |
-
if (scan8)
|
| 446 |
-
{
|
| 447 |
-
int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
|
| 448 |
-
volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
|
| 449 |
-
v[0] = sum; __syncwarp(scan8Mask);
|
| 450 |
-
#if (CR_BIN_SQR > 1 * 32)
|
| 451 |
-
sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
| 452 |
-
#endif
|
| 453 |
-
#if (CR_BIN_SQR > 2 * 32)
|
| 454 |
-
sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
| 455 |
-
#endif
|
| 456 |
-
#if (CR_BIN_SQR > 4 * 32)
|
| 457 |
-
sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
| 458 |
-
#endif
|
| 459 |
-
}
|
| 460 |
-
|
| 461 |
-
__syncthreads();
|
| 462 |
-
|
| 463 |
-
// Tile per thread: Finalize prefix sums.
|
| 464 |
-
// Single thread: Allocate segments.
|
| 465 |
-
|
| 466 |
-
for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
|
| 467 |
-
{
|
| 468 |
-
int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
|
| 469 |
-
int numEmits = sum >> emitShift;
|
| 470 |
-
int numAllocs = sum & ((1 << emitShift) - 1);
|
| 471 |
-
s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
|
| 472 |
-
s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
|
| 473 |
-
|
| 474 |
-
if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
|
| 475 |
-
{
|
| 476 |
-
int t = atomicAdd(&atomics.numTileSegs, numAllocs);
|
| 477 |
-
s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
|
| 478 |
-
}
|
| 479 |
-
}
|
| 480 |
-
|
| 481 |
-
__syncthreads();
|
| 482 |
-
int firstAllocSeg = s_firstAllocSeg;
|
| 483 |
-
int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
|
| 484 |
-
int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
|
| 485 |
-
|
| 486 |
-
//------------------------------------------------------------------------
|
| 487 |
-
// Emit.
|
| 488 |
-
//------------------------------------------------------------------------
|
| 489 |
-
|
| 490 |
-
// Emit per thread: Write triangle index to globalmem.
|
| 491 |
-
|
| 492 |
-
for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
|
| 493 |
-
{
|
| 494 |
-
// Find tile in bin.
|
| 495 |
-
|
| 496 |
-
U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
|
| 497 |
-
U8* tilePtr = tileBase;
|
| 498 |
-
U8* ptr;
|
| 499 |
-
|
| 500 |
-
#if (CR_BIN_SQR > 128)
|
| 501 |
-
ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
| 502 |
-
#endif
|
| 503 |
-
#if (CR_BIN_SQR > 64)
|
| 504 |
-
ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
| 505 |
-
#endif
|
| 506 |
-
#if (CR_BIN_SQR > 32)
|
| 507 |
-
ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
| 508 |
-
#endif
|
| 509 |
-
#if (CR_BIN_SQR > 16)
|
| 510 |
-
ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
| 511 |
-
#endif
|
| 512 |
-
#if (CR_BIN_SQR > 8)
|
| 513 |
-
ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
| 514 |
-
#endif
|
| 515 |
-
#if (CR_BIN_SQR > 4)
|
| 516 |
-
ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
| 517 |
-
#endif
|
| 518 |
-
#if (CR_BIN_SQR > 2)
|
| 519 |
-
ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
| 520 |
-
#endif
|
| 521 |
-
#if (CR_BIN_SQR > 1)
|
| 522 |
-
ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
| 523 |
-
#endif
|
| 524 |
-
|
| 525 |
-
int tileInBin = (tilePtr - tileBase) >> 2;
|
| 526 |
-
int emitInTile = emitInBin - *(U32*)tilePtr;
|
| 527 |
-
|
| 528 |
-
// Find warp in tile.
|
| 529 |
-
|
| 530 |
-
int warpStep = (CR_BIN_SQR + 1) * 4;
|
| 531 |
-
U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
|
| 532 |
-
U8* warpPtr = warpBase;
|
| 533 |
-
|
| 534 |
-
#if (CR_COARSE_WARPS > 8)
|
| 535 |
-
ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
|
| 536 |
-
#endif
|
| 537 |
-
#if (CR_COARSE_WARPS > 4)
|
| 538 |
-
ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
|
| 539 |
-
#endif
|
| 540 |
-
#if (CR_COARSE_WARPS > 2)
|
| 541 |
-
ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
|
| 542 |
-
#endif
|
| 543 |
-
#if (CR_COARSE_WARPS > 1)
|
| 544 |
-
ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
|
| 545 |
-
#endif
|
| 546 |
-
|
| 547 |
-
int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
|
| 548 |
-
U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
|
| 549 |
-
int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
|
| 550 |
-
|
| 551 |
-
// Find thread in warp.
|
| 552 |
-
|
| 553 |
-
int threadInWarp = 0;
|
| 554 |
-
int pop = __popc(emitMask & 0xFFFF);
|
| 555 |
-
bool pred = (emitInWarp >= pop);
|
| 556 |
-
if (pred) emitInWarp -= pop;
|
| 557 |
-
if (pred) emitMask >>= 0x10;
|
| 558 |
-
if (pred) threadInWarp += 0x10;
|
| 559 |
-
|
| 560 |
-
pop = __popc(emitMask & 0xFF);
|
| 561 |
-
pred = (emitInWarp >= pop);
|
| 562 |
-
if (pred) emitInWarp -= pop;
|
| 563 |
-
if (pred) emitMask >>= 0x08;
|
| 564 |
-
if (pred) threadInWarp += 0x08;
|
| 565 |
-
|
| 566 |
-
pop = __popc(emitMask & 0xF);
|
| 567 |
-
pred = (emitInWarp >= pop);
|
| 568 |
-
if (pred) emitInWarp -= pop;
|
| 569 |
-
if (pred) emitMask >>= 0x04;
|
| 570 |
-
if (pred) threadInWarp += 0x04;
|
| 571 |
-
|
| 572 |
-
pop = __popc(emitMask & 0x3);
|
| 573 |
-
pred = (emitInWarp >= pop);
|
| 574 |
-
if (pred) emitInWarp -= pop;
|
| 575 |
-
if (pred) emitMask >>= 0x02;
|
| 576 |
-
if (pred) threadInWarp += 0x02;
|
| 577 |
-
|
| 578 |
-
if (emitInWarp >= (emitMask & 1))
|
| 579 |
-
threadInWarp++;
|
| 580 |
-
|
| 581 |
-
// Figure out where to write.
|
| 582 |
-
|
| 583 |
-
int currOfs = s_tileStreamCurrOfs[tileInBin];
|
| 584 |
-
int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
|
| 585 |
-
int outOfs = emitInTile;
|
| 586 |
-
|
| 587 |
-
if (outOfs < spaceLeft)
|
| 588 |
-
outOfs += currOfs;
|
| 589 |
-
else
|
| 590 |
-
{
|
| 591 |
-
int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
|
| 592 |
-
outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
|
| 593 |
-
}
|
| 594 |
-
|
| 595 |
-
// Write.
|
| 596 |
-
|
| 597 |
-
int queueIdx = warpInTile * 32 + threadInWarp;
|
| 598 |
-
int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
|
| 599 |
-
|
| 600 |
-
tileSegData[outOfs] = triIdx;
|
| 601 |
-
}
|
| 602 |
-
|
| 603 |
-
//------------------------------------------------------------------------
|
| 604 |
-
// Patch.
|
| 605 |
-
//------------------------------------------------------------------------
|
| 606 |
-
|
| 607 |
-
// Allocated segment per thread: Initialize next-pointer and count.
|
| 608 |
-
|
| 609 |
-
for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
|
| 610 |
-
{
|
| 611 |
-
int segIdx = firstAllocSeg + i;
|
| 612 |
-
tileSegNext[segIdx] = segIdx + 1;
|
| 613 |
-
tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
|
| 614 |
-
}
|
| 615 |
-
|
| 616 |
-
// Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
|
| 617 |
-
|
| 618 |
-
__syncthreads();
|
| 619 |
-
for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
|
| 620 |
-
{
|
| 621 |
-
int oldOfs = s_tileStreamCurrOfs[tileInBin];
|
| 622 |
-
int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
|
| 623 |
-
int allocLo = s_tileAllocPrefixSum[tileInBin];
|
| 624 |
-
int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
|
| 625 |
-
|
| 626 |
-
if (allocLo != allocHi)
|
| 627 |
-
{
|
| 628 |
-
S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
|
| 629 |
-
if (oldOfs < 0)
|
| 630 |
-
nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
|
| 631 |
-
*nextPtr = firstAllocSeg + allocLo;
|
| 632 |
-
|
| 633 |
-
newOfs--;
|
| 634 |
-
newOfs &= CR_TILE_SEG_SIZE - 1;
|
| 635 |
-
newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
|
| 636 |
-
newOfs++;
|
| 637 |
-
}
|
| 638 |
-
s_tileStreamCurrOfs[tileInBin] = newOfs;
|
| 639 |
-
}
|
| 640 |
-
|
| 641 |
-
// Advance queue read pointer.
|
| 642 |
-
// Queue became empty => bin done.
|
| 643 |
-
|
| 644 |
-
triQueueReadPos += CR_COARSE_WARPS * 32;
|
| 645 |
-
}
|
| 646 |
-
while (triQueueReadPos < triQueueWritePos);
|
| 647 |
-
|
| 648 |
-
// Tile per thread: Fix next-pointer and count of the last segment.
|
| 649 |
-
// 32 tiles per warp: Count active tiles.
|
| 650 |
-
|
| 651 |
-
__syncthreads();
|
| 652 |
-
|
| 653 |
-
for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
|
| 654 |
-
{
|
| 655 |
-
int tileInBin = tileInBin_base + thrInBlock;
|
| 656 |
-
bool act = (tileInBin < CR_BIN_SQR);
|
| 657 |
-
U32 actMask = __ballot_sync(~0u, act);
|
| 658 |
-
if (act)
|
| 659 |
-
{
|
| 660 |
-
int tileX = tileInBin & (CR_BIN_SIZE - 1);
|
| 661 |
-
int tileY = tileInBin >> CR_BIN_LOG2;
|
| 662 |
-
bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
|
| 663 |
-
|
| 664 |
-
int ofs = s_tileStreamCurrOfs[tileInBin];
|
| 665 |
-
int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
|
| 666 |
-
int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
|
| 667 |
-
|
| 668 |
-
if (ofs >= 0)
|
| 669 |
-
tileSegNext[segIdx] = -1;
|
| 670 |
-
else if (force)
|
| 671 |
-
{
|
| 672 |
-
s_tileStreamCurrOfs[tileInBin] = 0;
|
| 673 |
-
tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
|
| 674 |
-
}
|
| 675 |
-
|
| 676 |
-
if (segCount != 0)
|
| 677 |
-
tileSegCount[segIdx] = segCount;
|
| 678 |
-
|
| 679 |
-
U32 res = __ballot_sync(actMask, ofs >= 0 | force);
|
| 680 |
-
if (threadIdx.x == 0)
|
| 681 |
-
s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
|
| 682 |
-
}
|
| 683 |
-
}
|
| 684 |
-
|
| 685 |
-
// First warp: Scan-8.
|
| 686 |
-
// One thread: Allocate space for active tiles.
|
| 687 |
-
|
| 688 |
-
__syncthreads();
|
| 689 |
-
|
| 690 |
-
bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
|
| 691 |
-
U32 scan8Mask = __ballot_sync(~0u, scan8);
|
| 692 |
-
if (scan8)
|
| 693 |
-
{
|
| 694 |
-
volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
|
| 695 |
-
U32 sum = v[0];
|
| 696 |
-
#if (CR_BIN_SQR > 1 * 32)
|
| 697 |
-
sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
| 698 |
-
#endif
|
| 699 |
-
#if (CR_BIN_SQR > 2 * 32)
|
| 700 |
-
sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
| 701 |
-
#endif
|
| 702 |
-
#if (CR_BIN_SQR > 4 * 32)
|
| 703 |
-
sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
| 704 |
-
#endif
|
| 705 |
-
|
| 706 |
-
if (thrInBlock == CR_BIN_SQR / 32 - 1)
|
| 707 |
-
s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
|
| 708 |
-
}
|
| 709 |
-
|
| 710 |
-
// Tile per thread: Output active tiles.
|
| 711 |
-
|
| 712 |
-
__syncthreads();
|
| 713 |
-
|
| 714 |
-
for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
|
| 715 |
-
{
|
| 716 |
-
int tileInBin = tileInBin_base + thrInBlock;
|
| 717 |
-
bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
|
| 718 |
-
U32 actMask = __ballot_sync(~0u, act);
|
| 719 |
-
if (act)
|
| 720 |
-
{
|
| 721 |
-
int activeIdx = s_firstActiveIdx;
|
| 722 |
-
activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
|
| 723 |
-
activeIdx += __popc(actMask & getLaneMaskLt());
|
| 724 |
-
activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
|
| 725 |
-
}
|
| 726 |
-
}
|
| 727 |
-
}
|
| 728 |
-
}
|
| 729 |
-
|
| 730 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp
DELETED
|
@@ -1,73 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
|
| 11 |
-
//------------------------------------------------------------------------
|
| 12 |
-
|
| 13 |
-
#define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize.
|
| 14 |
-
#define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize.
|
| 15 |
-
|
| 16 |
-
#define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize.
|
| 17 |
-
#define CR_BIN_LOG2 4 // BinSize / TileSize.
|
| 18 |
-
#define CR_TILE_LOG2 3 // TileSize / PixelSize.
|
| 19 |
-
|
| 20 |
-
#define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries.
|
| 21 |
-
#define CR_FLIPBIT_FLIP_Y 2
|
| 22 |
-
#define CR_FLIPBIT_FLIP_X 3
|
| 23 |
-
#define CR_FLIPBIT_SWAP_XY 4
|
| 24 |
-
#define CR_FLIPBIT_COMPL 5
|
| 25 |
-
|
| 26 |
-
#define CR_BIN_STREAMS_LOG2 4
|
| 27 |
-
#define CR_BIN_SEG_LOG2 9 // 32-bit entries.
|
| 28 |
-
#define CR_TILE_SEG_LOG2 5 // 32-bit entries.
|
| 29 |
-
|
| 30 |
-
#define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster.
|
| 31 |
-
#define CR_COARSE_QUEUE_LOG2 10 // Triangles.
|
| 32 |
-
|
| 33 |
-
#define CR_SETUP_WARPS 2
|
| 34 |
-
#define CR_SETUP_OPT_BLOCKS 8
|
| 35 |
-
#define CR_BIN_WARPS 16
|
| 36 |
-
#define CR_COARSE_WARPS 16 // Must be a power of two.
|
| 37 |
-
#define CR_FINE_MAX_WARPS 20
|
| 38 |
-
|
| 39 |
-
#define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block.
|
| 40 |
-
|
| 41 |
-
//------------------------------------------------------------------------
|
| 42 |
-
|
| 43 |
-
#define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2)
|
| 44 |
-
#define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2)
|
| 45 |
-
#define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2))
|
| 46 |
-
|
| 47 |
-
#define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2)
|
| 48 |
-
#define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2))
|
| 49 |
-
#define CR_BIN_SIZE (1 << CR_BIN_LOG2)
|
| 50 |
-
#define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2))
|
| 51 |
-
|
| 52 |
-
#define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2)
|
| 53 |
-
#define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2)
|
| 54 |
-
#define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2))
|
| 55 |
-
#define CR_TILE_SIZE (1 << CR_TILE_LOG2)
|
| 56 |
-
#define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2))
|
| 57 |
-
|
| 58 |
-
#define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2)
|
| 59 |
-
#define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2)
|
| 60 |
-
#define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2)
|
| 61 |
-
|
| 62 |
-
#define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2)
|
| 63 |
-
#define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2)
|
| 64 |
-
|
| 65 |
-
//------------------------------------------------------------------------
|
| 66 |
-
// When evaluating interpolated Z pixel centers, we may introduce an error
|
| 67 |
-
// of (+-CR_LERP_ERROR) ULPs.
|
| 68 |
-
|
| 69 |
-
#define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2))
|
| 70 |
-
#define CR_DEPTH_MIN CR_LERP_ERROR(3)
|
| 71 |
-
#define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3))
|
| 72 |
-
|
| 73 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp
DELETED
|
@@ -1,79 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#include "Defs.hpp"
|
| 10 |
-
#include "../CudaRaster.hpp"
|
| 11 |
-
#include "RasterImpl.hpp"
|
| 12 |
-
|
| 13 |
-
using namespace CR;
|
| 14 |
-
|
| 15 |
-
//------------------------------------------------------------------------
|
| 16 |
-
// Stub interface implementation.
|
| 17 |
-
//------------------------------------------------------------------------
|
| 18 |
-
|
| 19 |
-
CudaRaster::CudaRaster()
|
| 20 |
-
{
|
| 21 |
-
m_impl = new RasterImpl();
|
| 22 |
-
}
|
| 23 |
-
|
| 24 |
-
CudaRaster::~CudaRaster()
|
| 25 |
-
{
|
| 26 |
-
delete m_impl;
|
| 27 |
-
}
|
| 28 |
-
|
| 29 |
-
void CudaRaster::setBufferSize(int width, int height, int numImages)
|
| 30 |
-
{
|
| 31 |
-
m_impl->setBufferSize(Vec3i(width, height, numImages));
|
| 32 |
-
}
|
| 33 |
-
|
| 34 |
-
void CudaRaster::setViewport(int width, int height, int offsetX, int offsetY)
|
| 35 |
-
{
|
| 36 |
-
m_impl->setViewport(Vec2i(width, height), Vec2i(offsetX, offsetY));
|
| 37 |
-
}
|
| 38 |
-
|
| 39 |
-
void CudaRaster::setRenderModeFlags(U32 flags)
|
| 40 |
-
{
|
| 41 |
-
m_impl->setRenderModeFlags(flags);
|
| 42 |
-
}
|
| 43 |
-
|
| 44 |
-
void CudaRaster::deferredClear(U32 clearColor)
|
| 45 |
-
{
|
| 46 |
-
m_impl->deferredClear(clearColor);
|
| 47 |
-
}
|
| 48 |
-
|
| 49 |
-
void CudaRaster::setVertexBuffer(void* vertices, int numVertices)
|
| 50 |
-
{
|
| 51 |
-
m_impl->setVertexBuffer(vertices, numVertices);
|
| 52 |
-
}
|
| 53 |
-
|
| 54 |
-
void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
|
| 55 |
-
{
|
| 56 |
-
m_impl->setIndexBuffer(indices, numTriangles);
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream)
|
| 60 |
-
{
|
| 61 |
-
return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream);
|
| 62 |
-
}
|
| 63 |
-
|
| 64 |
-
void* CudaRaster::getColorBuffer(void)
|
| 65 |
-
{
|
| 66 |
-
return m_impl->getColorBuffer();
|
| 67 |
-
}
|
| 68 |
-
|
| 69 |
-
void* CudaRaster::getDepthBuffer(void)
|
| 70 |
-
{
|
| 71 |
-
return m_impl->getDepthBuffer();
|
| 72 |
-
}
|
| 73 |
-
|
| 74 |
-
void CudaRaster::swapDepthAndPeel(void)
|
| 75 |
-
{
|
| 76 |
-
m_impl->swapDepthAndPeel();
|
| 77 |
-
}
|
| 78 |
-
|
| 79 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp
DELETED
|
@@ -1,90 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
#include <cuda_runtime.h>
|
| 11 |
-
#include <cstdint>
|
| 12 |
-
|
| 13 |
-
namespace CR
|
| 14 |
-
{
|
| 15 |
-
//------------------------------------------------------------------------
|
| 16 |
-
|
| 17 |
-
#ifndef NULL
|
| 18 |
-
# define NULL 0
|
| 19 |
-
#endif
|
| 20 |
-
|
| 21 |
-
#ifdef __CUDACC__
|
| 22 |
-
# define CR_CUDA 1
|
| 23 |
-
#else
|
| 24 |
-
# define CR_CUDA 0
|
| 25 |
-
#endif
|
| 26 |
-
|
| 27 |
-
#if CR_CUDA
|
| 28 |
-
# define CR_CUDA_FUNC __device__ __inline__
|
| 29 |
-
# define CR_CUDA_CONST __constant__
|
| 30 |
-
#else
|
| 31 |
-
# define CR_CUDA_FUNC inline
|
| 32 |
-
# define CR_CUDA_CONST static const
|
| 33 |
-
#endif
|
| 34 |
-
|
| 35 |
-
#define CR_UNREF(X) ((void)(X))
|
| 36 |
-
#define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0])))
|
| 37 |
-
|
| 38 |
-
//------------------------------------------------------------------------
|
| 39 |
-
|
| 40 |
-
typedef uint8_t U8;
|
| 41 |
-
typedef uint16_t U16;
|
| 42 |
-
typedef uint32_t U32;
|
| 43 |
-
typedef uint64_t U64;
|
| 44 |
-
typedef int8_t S8;
|
| 45 |
-
typedef int16_t S16;
|
| 46 |
-
typedef int32_t S32;
|
| 47 |
-
typedef int64_t S64;
|
| 48 |
-
typedef float F32;
|
| 49 |
-
typedef double F64;
|
| 50 |
-
typedef void (*FuncPtr)(void);
|
| 51 |
-
|
| 52 |
-
//------------------------------------------------------------------------
|
| 53 |
-
|
| 54 |
-
#define CR_U32_MAX (0xFFFFFFFFu)
|
| 55 |
-
#define CR_S32_MIN (~0x7FFFFFFF)
|
| 56 |
-
#define CR_S32_MAX (0x7FFFFFFF)
|
| 57 |
-
#define CR_U64_MAX ((U64)(S64)-1)
|
| 58 |
-
#define CR_S64_MIN ((S64)-1 << 63)
|
| 59 |
-
#define CR_S64_MAX (~((S64)-1 << 63))
|
| 60 |
-
#define CR_F32_MIN (1.175494351e-38f)
|
| 61 |
-
#define CR_F32_MAX (3.402823466e+38f)
|
| 62 |
-
#define CR_F64_MIN (2.2250738585072014e-308)
|
| 63 |
-
#define CR_F64_MAX (1.7976931348623158e+308)
|
| 64 |
-
|
| 65 |
-
//------------------------------------------------------------------------
|
| 66 |
-
// Misc types.
|
| 67 |
-
|
| 68 |
-
class Vec2i
|
| 69 |
-
{
|
| 70 |
-
public:
|
| 71 |
-
Vec2i(int x_, int y_) : x(x_), y(y_) {}
|
| 72 |
-
int x, y;
|
| 73 |
-
};
|
| 74 |
-
|
| 75 |
-
class Vec3i
|
| 76 |
-
{
|
| 77 |
-
public:
|
| 78 |
-
Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
|
| 79 |
-
int x, y, z;
|
| 80 |
-
};
|
| 81 |
-
|
| 82 |
-
//------------------------------------------------------------------------
|
| 83 |
-
// CUDA utilities.
|
| 84 |
-
|
| 85 |
-
#if CR_CUDA
|
| 86 |
-
# define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y)))
|
| 87 |
-
#endif
|
| 88 |
-
|
| 89 |
-
//------------------------------------------------------------------------
|
| 90 |
-
} // namespace CR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl
DELETED
|
@@ -1,385 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
//------------------------------------------------------------------------
|
| 10 |
-
// Utility funcs.
|
| 11 |
-
//------------------------------------------------------------------------
|
| 12 |
-
|
| 13 |
-
__device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth)
|
| 14 |
-
{
|
| 15 |
-
tileZMax = CR_DEPTH_MAX;
|
| 16 |
-
tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax);
|
| 17 |
-
}
|
| 18 |
-
|
| 19 |
-
__device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp)
|
| 20 |
-
{
|
| 21 |
-
// Entry is warp-coherent.
|
| 22 |
-
if (__any_sync(~0u, tileZUpd))
|
| 23 |
-
{
|
| 24 |
-
U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp();
|
| 25 |
-
temp[threadIdx.x + 16] = z; __syncwarp();
|
| 26 |
-
z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
| 27 |
-
z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
| 28 |
-
z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
| 29 |
-
z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
| 30 |
-
z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
| 31 |
-
tileZMax = temp[47];
|
| 32 |
-
tileZUpd = false;
|
| 33 |
-
}
|
| 34 |
-
}
|
| 35 |
-
|
| 36 |
-
//------------------------------------------------------------------------
|
| 37 |
-
|
| 38 |
-
__device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment)
|
| 39 |
-
{
|
| 40 |
-
const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;;
|
| 41 |
-
const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
|
| 42 |
-
const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
|
| 43 |
-
const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
|
| 44 |
-
|
| 45 |
-
if (threadIdx.x >= tileSegCount[segment])
|
| 46 |
-
{
|
| 47 |
-
triIdx = -1;
|
| 48 |
-
dataIdx = -1;
|
| 49 |
-
}
|
| 50 |
-
else
|
| 51 |
-
{
|
| 52 |
-
int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x];
|
| 53 |
-
triIdx = subtriIdx >> 3;
|
| 54 |
-
dataIdx = triIdx;
|
| 55 |
-
subtriIdx &= 7;
|
| 56 |
-
if (subtriIdx != 7)
|
| 57 |
-
dataIdx = triHeaderPtr[triIdx].misc + subtriIdx;
|
| 58 |
-
triHeader = *((uint4*)triHeaderPtr + dataIdx);
|
| 59 |
-
}
|
| 60 |
-
|
| 61 |
-
// advance to next segment
|
| 62 |
-
segment = tileSegNext[segment];
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
//------------------------------------------------------------------------
|
| 66 |
-
|
| 67 |
-
__device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax)
|
| 68 |
-
{
|
| 69 |
-
U32 zmin = triHeader.w & 0xFFFFF000;
|
| 70 |
-
return (zmin > tileZMax);
|
| 71 |
-
}
|
| 72 |
-
|
| 73 |
-
//------------------------------------------------------------------------
|
| 74 |
-
|
| 75 |
-
__device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut)
|
| 76 |
-
{
|
| 77 |
-
int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
|
| 78 |
-
int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
|
| 79 |
-
|
| 80 |
-
// extract S16 vertex positions while subtracting tile coordinates
|
| 81 |
-
S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX);
|
| 82 |
-
S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY);
|
| 83 |
-
S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x);
|
| 84 |
-
S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x);
|
| 85 |
-
S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z);
|
| 86 |
-
S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z);
|
| 87 |
-
|
| 88 |
-
// extract flipbits
|
| 89 |
-
U32 f01 = (triHeader.w >> 6) & 0x3C;
|
| 90 |
-
U32 f12 = (triHeader.w >> 2) & 0x3C;
|
| 91 |
-
U32 f20 = (triHeader.w << 2) & 0x3C;
|
| 92 |
-
|
| 93 |
-
// compute per-edge coverage masks
|
| 94 |
-
U64 c01, c12, c20;
|
| 95 |
-
c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut);
|
| 96 |
-
c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut);
|
| 97 |
-
c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut);
|
| 98 |
-
|
| 99 |
-
// combine masks
|
| 100 |
-
return c01 & c12 & c20;
|
| 101 |
-
}
|
| 102 |
-
|
| 103 |
-
//------------------------------------------------------------------------
|
| 104 |
-
|
| 105 |
-
__device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp)
|
| 106 |
-
{
|
| 107 |
-
__syncwarp();
|
| 108 |
-
temp[threadIdx.x + 16] = value; __syncwarp();
|
| 109 |
-
value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
| 110 |
-
value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
| 111 |
-
value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
| 112 |
-
value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
| 113 |
-
value += temp[threadIdx.x + 16 - 16]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
| 114 |
-
return value;
|
| 115 |
-
}
|
| 116 |
-
|
| 117 |
-
__device__ __inline__ volatile const U32& scan32_total(volatile U32* temp)
|
| 118 |
-
{
|
| 119 |
-
return temp[47];
|
| 120 |
-
}
|
| 121 |
-
|
| 122 |
-
//------------------------------------------------------------------------
|
| 123 |
-
|
| 124 |
-
__device__ __inline__ S32 findBit(U64 mask, int idx)
|
| 125 |
-
{
|
| 126 |
-
U32 x = getLo(mask);
|
| 127 |
-
int pop = __popc(x);
|
| 128 |
-
bool p = (pop <= idx);
|
| 129 |
-
if (p) x = getHi(mask);
|
| 130 |
-
if (p) idx -= pop;
|
| 131 |
-
int bit = p ? 32 : 0;
|
| 132 |
-
|
| 133 |
-
pop = __popc(x & 0x0000ffffu);
|
| 134 |
-
p = (pop <= idx);
|
| 135 |
-
if (p) x >>= 16;
|
| 136 |
-
if (p) bit += 16;
|
| 137 |
-
if (p) idx -= pop;
|
| 138 |
-
|
| 139 |
-
U32 tmp = x & 0x000000ffu;
|
| 140 |
-
pop = __popc(tmp);
|
| 141 |
-
p = (pop <= idx);
|
| 142 |
-
if (p) tmp = x & 0x0000ff00u;
|
| 143 |
-
if (p) idx -= pop;
|
| 144 |
-
|
| 145 |
-
return findLeadingOne(tmp) + bit - idx;
|
| 146 |
-
}
|
| 147 |
-
|
| 148 |
-
//------------------------------------------------------------------------
|
| 149 |
-
// Single-sample implementation.
|
| 150 |
-
//------------------------------------------------------------------------
|
| 151 |
-
|
| 152 |
-
__device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask)
|
| 153 |
-
{
|
| 154 |
-
atomicMin((U32*)pDepth, depth);
|
| 155 |
-
__syncwarp(ropMask);
|
| 156 |
-
bool act = (depth == *pDepth);
|
| 157 |
-
__syncwarp(ropMask);
|
| 158 |
-
U32 actMask = __ballot_sync(ropMask, act);
|
| 159 |
-
if (act)
|
| 160 |
-
{
|
| 161 |
-
*pDepth = 0;
|
| 162 |
-
__syncwarp(actMask);
|
| 163 |
-
atomicMax((U32*)pDepth, threadIdx.x);
|
| 164 |
-
__syncwarp(actMask);
|
| 165 |
-
if (*pDepth == threadIdx.x)
|
| 166 |
-
{
|
| 167 |
-
*pDepth = depth;
|
| 168 |
-
*pColor = color;
|
| 169 |
-
}
|
| 170 |
-
__syncwarp(actMask);
|
| 171 |
-
}
|
| 172 |
-
}
|
| 173 |
-
|
| 174 |
-
//------------------------------------------------------------------------
|
| 175 |
-
|
| 176 |
-
__device__ __inline__ void fineRasterImpl(const CRParams p)
|
| 177 |
-
{
|
| 178 |
-
// for 20 warps:
|
| 179 |
-
__shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB
|
| 180 |
-
__shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
|
| 181 |
-
__shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
|
| 182 |
-
__shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
|
| 183 |
-
__shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData index
|
| 184 |
-
__shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask
|
| 185 |
-
__shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index
|
| 186 |
-
__shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB
|
| 187 |
-
// = 47.25KB total
|
| 188 |
-
|
| 189 |
-
CRAtomics& atomics = p.atomics[blockIdx.z];
|
| 190 |
-
const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris;
|
| 191 |
-
|
| 192 |
-
const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
|
| 193 |
-
const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
|
| 194 |
-
|
| 195 |
-
volatile U32* tileColor = s_tileColor[threadIdx.y];
|
| 196 |
-
volatile U32* tileDepth = s_tileDepth[threadIdx.y];
|
| 197 |
-
volatile U32* tilePeel = s_tilePeel[threadIdx.y];
|
| 198 |
-
volatile U32* triDataIdx = s_triDataIdx[threadIdx.y];
|
| 199 |
-
volatile U64* triangleCov = s_triangleCov[threadIdx.y];
|
| 200 |
-
volatile U32* triangleFrag = s_triangleFrag[threadIdx.y];
|
| 201 |
-
volatile U32* temp = s_temp[threadIdx.y];
|
| 202 |
-
|
| 203 |
-
if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs)
|
| 204 |
-
return;
|
| 205 |
-
|
| 206 |
-
temp[threadIdx.x] = 0; // first 16 elements of temp are always zero
|
| 207 |
-
cover8x8_setupLUT(s_cover8x8_lut);
|
| 208 |
-
__syncthreads();
|
| 209 |
-
|
| 210 |
-
// loop over tiles
|
| 211 |
-
for (;;)
|
| 212 |
-
{
|
| 213 |
-
// pick a tile
|
| 214 |
-
if (threadIdx.x == 0)
|
| 215 |
-
temp[16] = atomicAdd(&atomics.fineCounter, 1);
|
| 216 |
-
__syncwarp();
|
| 217 |
-
int activeIdx = temp[16];
|
| 218 |
-
if (activeIdx >= atomics.numActiveTiles)
|
| 219 |
-
break;
|
| 220 |
-
|
| 221 |
-
int tileIdx = activeTiles[activeIdx];
|
| 222 |
-
S32 segment = tileFirstSeg[tileIdx];
|
| 223 |
-
int tileY = tileIdx / p.widthTiles;
|
| 224 |
-
int tileX = tileIdx - tileY * p.widthTiles;
|
| 225 |
-
int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
|
| 226 |
-
int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
|
| 227 |
-
|
| 228 |
-
// initialize per-tile state
|
| 229 |
-
int triRead = 0, triWrite = 0;
|
| 230 |
-
int fragRead = 0, fragWrite = 0;
|
| 231 |
-
if (threadIdx.x == 0)
|
| 232 |
-
triangleFrag[63] = 0; // "previous triangle"
|
| 233 |
-
|
| 234 |
-
// deferred clear => clear tile
|
| 235 |
-
if (p.deferredClear)
|
| 236 |
-
{
|
| 237 |
-
tileColor[threadIdx.x] = p.clearColor;
|
| 238 |
-
tileDepth[threadIdx.x] = p.clearDepth;
|
| 239 |
-
tileColor[threadIdx.x + 32] = p.clearColor;
|
| 240 |
-
tileDepth[threadIdx.x + 32] = p.clearDepth;
|
| 241 |
-
}
|
| 242 |
-
else // otherwise => read tile from framebuffer
|
| 243 |
-
{
|
| 244 |
-
U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
|
| 245 |
-
U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
|
| 246 |
-
tileColor[threadIdx.x] = pColor[px + p.strideX * py];
|
| 247 |
-
tileDepth[threadIdx.x] = pDepth[px + p.strideX * py];
|
| 248 |
-
tileColor[threadIdx.x + 32] = pColor[px + p.strideX * (py + 4)];
|
| 249 |
-
tileDepth[threadIdx.x + 32] = pDepth[px + p.strideX * (py + 4)];
|
| 250 |
-
}
|
| 251 |
-
|
| 252 |
-
// read peeling inputs if enabled
|
| 253 |
-
if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling)
|
| 254 |
-
{
|
| 255 |
-
U32* pPeel = (U32*)p.peelBuffer + p.strideX * p.strideY * blockIdx.z;
|
| 256 |
-
tilePeel[threadIdx.x] = pPeel[px + p.strideX * py];
|
| 257 |
-
tilePeel[threadIdx.x + 32] = pPeel[px + p.strideX * (py + 4)];
|
| 258 |
-
}
|
| 259 |
-
|
| 260 |
-
U32 tileZMax;
|
| 261 |
-
bool tileZUpd;
|
| 262 |
-
initTileZMax(tileZMax, tileZUpd, tileDepth);
|
| 263 |
-
|
| 264 |
-
// process fragments
|
| 265 |
-
for(;;)
|
| 266 |
-
{
|
| 267 |
-
// need to queue more fragments?
|
| 268 |
-
if (fragWrite - fragRead < 32 && segment >= 0)
|
| 269 |
-
{
|
| 270 |
-
// update tile z - coherent over warp
|
| 271 |
-
updateTileZMax(tileZMax, tileZUpd, tileDepth, temp);
|
| 272 |
-
|
| 273 |
-
// read triangles
|
| 274 |
-
do
|
| 275 |
-
{
|
| 276 |
-
// read triangle index and data, advance to next segment
|
| 277 |
-
S32 triIdx, dataIdx;
|
| 278 |
-
uint4 triHeader;
|
| 279 |
-
getTriangle(p, triIdx, dataIdx, triHeader, segment);
|
| 280 |
-
|
| 281 |
-
// early z cull
|
| 282 |
-
if (triIdx >= 0 && earlyZCull(triHeader, tileZMax))
|
| 283 |
-
triIdx = -1;
|
| 284 |
-
|
| 285 |
-
// determine coverage
|
| 286 |
-
U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut);
|
| 287 |
-
S32 pop = (triIdx == -1) ? 0 : __popcll(coverage);
|
| 288 |
-
|
| 289 |
-
// fragment count scan
|
| 290 |
-
U32 frag = scan32_value(pop, temp);
|
| 291 |
-
frag += fragWrite; // frag now holds cumulative fragment count
|
| 292 |
-
fragWrite += scan32_total(temp);
|
| 293 |
-
|
| 294 |
-
// queue non-empty triangles
|
| 295 |
-
U32 goodMask = __ballot_sync(~0u, pop != 0);
|
| 296 |
-
if (pop != 0)
|
| 297 |
-
{
|
| 298 |
-
int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63;
|
| 299 |
-
triDataIdx [idx] = dataIdx;
|
| 300 |
-
triangleFrag[idx] = frag;
|
| 301 |
-
triangleCov [idx] = coverage;
|
| 302 |
-
}
|
| 303 |
-
triWrite += __popc(goodMask);
|
| 304 |
-
}
|
| 305 |
-
while (fragWrite - fragRead < 32 && segment >= 0);
|
| 306 |
-
}
|
| 307 |
-
__syncwarp();
|
| 308 |
-
|
| 309 |
-
// end of segment?
|
| 310 |
-
if (fragRead == fragWrite)
|
| 311 |
-
break;
|
| 312 |
-
|
| 313 |
-
// clear triangle boundaries
|
| 314 |
-
temp[threadIdx.x + 16] = 0;
|
| 315 |
-
__syncwarp();
|
| 316 |
-
|
| 317 |
-
// tag triangle boundaries
|
| 318 |
-
if (triRead + threadIdx.x < triWrite)
|
| 319 |
-
{
|
| 320 |
-
int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead;
|
| 321 |
-
if (idx <= 32)
|
| 322 |
-
temp[idx + 16 - 1] = 1;
|
| 323 |
-
}
|
| 324 |
-
__syncwarp();
|
| 325 |
-
|
| 326 |
-
int ropLaneIdx = threadIdx.x;
|
| 327 |
-
U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]);
|
| 328 |
-
|
| 329 |
-
// distribute fragments
|
| 330 |
-
bool hasFragment = (ropLaneIdx < fragWrite - fragRead);
|
| 331 |
-
U32 fragmentMask = __ballot_sync(~0u, hasFragment);
|
| 332 |
-
if (hasFragment)
|
| 333 |
-
{
|
| 334 |
-
int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63;
|
| 335 |
-
int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]);
|
| 336 |
-
U64 coverage = triangleCov[triBufIdx];
|
| 337 |
-
int pixelInTile = findBit(coverage, fragIdx);
|
| 338 |
-
int dataIdx = triDataIdx[triBufIdx];
|
| 339 |
-
|
| 340 |
-
// determine pixel position
|
| 341 |
-
U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7);
|
| 342 |
-
U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3);
|
| 343 |
-
|
| 344 |
-
// depth test
|
| 345 |
-
U32 depth = 0;
|
| 346 |
-
uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4));
|
| 347 |
-
|
| 348 |
-
depth = td.x * pixelX + td.y * pixelY + td.z;
|
| 349 |
-
bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]);
|
| 350 |
-
if (!zkill)
|
| 351 |
-
{
|
| 352 |
-
U32 oldDepth = tileDepth[pixelInTile];
|
| 353 |
-
if (depth > oldDepth)
|
| 354 |
-
zkill = true;
|
| 355 |
-
else if (oldDepth == tileZMax)
|
| 356 |
-
tileZUpd = true; // we are replacing previous zmax => need to update
|
| 357 |
-
}
|
| 358 |
-
|
| 359 |
-
U32 ropMask = __ballot_sync(fragmentMask, !zkill);
|
| 360 |
-
if (!zkill)
|
| 361 |
-
executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask);
|
| 362 |
-
}
|
| 363 |
-
// no need to sync, as next up is updateTileZMax that does internal warp sync
|
| 364 |
-
|
| 365 |
-
// update counters
|
| 366 |
-
fragRead = ::min(fragRead + 32, fragWrite);
|
| 367 |
-
triRead += __popc(boundaryMask);
|
| 368 |
-
}
|
| 369 |
-
|
| 370 |
-
// Write tile back to the framebuffer.
|
| 371 |
-
if (true)
|
| 372 |
-
{
|
| 373 |
-
int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
|
| 374 |
-
int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
|
| 375 |
-
U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
|
| 376 |
-
U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
|
| 377 |
-
pColor[px + p.strideX * py] = tileColor[threadIdx.x];
|
| 378 |
-
pDepth[px + p.strideX * py] = tileDepth[threadIdx.x];
|
| 379 |
-
pColor[px + p.strideX * (py + 4)] = tileColor[threadIdx.x + 32];
|
| 380 |
-
pDepth[px + p.strideX * (py + 4)] = tileDepth[threadIdx.x + 32];
|
| 381 |
-
}
|
| 382 |
-
}
|
| 383 |
-
}
|
| 384 |
-
|
| 385 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp
DELETED
|
@@ -1,153 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
#include "Defs.hpp"
|
| 11 |
-
#include "Constants.hpp"
|
| 12 |
-
|
| 13 |
-
namespace CR
|
| 14 |
-
{
|
| 15 |
-
//------------------------------------------------------------------------
|
| 16 |
-
// Projected triangle.
|
| 17 |
-
//------------------------------------------------------------------------
|
| 18 |
-
|
| 19 |
-
struct CRTriangleHeader
|
| 20 |
-
{
|
| 21 |
-
S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1.
|
| 22 |
-
S16 v0y;
|
| 23 |
-
S16 v1x;
|
| 24 |
-
S16 v1y;
|
| 25 |
-
S16 v2x;
|
| 26 |
-
S16 v2y;
|
| 27 |
-
|
| 28 |
-
U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase)
|
| 29 |
-
};
|
| 30 |
-
|
| 31 |
-
//------------------------------------------------------------------------
|
| 32 |
-
|
| 33 |
-
struct CRTriangleData
|
| 34 |
-
{
|
| 35 |
-
U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2)
|
| 36 |
-
U32 zy;
|
| 37 |
-
U32 zb;
|
| 38 |
-
U32 id; // Triangle id.
|
| 39 |
-
};
|
| 40 |
-
|
| 41 |
-
//------------------------------------------------------------------------
|
| 42 |
-
// Device-side structures.
|
| 43 |
-
//------------------------------------------------------------------------
|
| 44 |
-
|
| 45 |
-
struct CRAtomics
|
| 46 |
-
{
|
| 47 |
-
// Setup.
|
| 48 |
-
S32 numSubtris; // = numTris
|
| 49 |
-
|
| 50 |
-
// Bin.
|
| 51 |
-
S32 binCounter; // = 0
|
| 52 |
-
S32 numBinSegs; // = 0
|
| 53 |
-
|
| 54 |
-
// Coarse.
|
| 55 |
-
S32 coarseCounter; // = 0
|
| 56 |
-
S32 numTileSegs; // = 0
|
| 57 |
-
S32 numActiveTiles; // = 0
|
| 58 |
-
|
| 59 |
-
// Fine.
|
| 60 |
-
S32 fineCounter; // = 0
|
| 61 |
-
};
|
| 62 |
-
|
| 63 |
-
//------------------------------------------------------------------------
|
| 64 |
-
|
| 65 |
-
struct CRImageParams
|
| 66 |
-
{
|
| 67 |
-
S32 triOffset; // First triangle index to draw.
|
| 68 |
-
S32 triCount; // Number of triangles to draw.
|
| 69 |
-
S32 binBatchSize; // Number of triangles per batch.
|
| 70 |
-
};
|
| 71 |
-
|
| 72 |
-
//------------------------------------------------------------------------
|
| 73 |
-
|
| 74 |
-
struct CRParams
|
| 75 |
-
{
|
| 76 |
-
// Common.
|
| 77 |
-
|
| 78 |
-
CRAtomics* atomics; // Work counters. Per-image.
|
| 79 |
-
S32 numImages; // Batch size.
|
| 80 |
-
S32 totalCount; // In range mode, total number of triangles to render.
|
| 81 |
-
S32 instanceMode; // 0 = range mode, 1 = instance mode.
|
| 82 |
-
|
| 83 |
-
S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode.
|
| 84 |
-
S32 numTriangles; // Number of triangles in input buffer.
|
| 85 |
-
void* vertexBuffer; // numVertices * float4(x, y, z, w)
|
| 86 |
-
void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2)
|
| 87 |
-
|
| 88 |
-
S32 widthPixels; // Render buffer size in pixels. Must be multiple of tile size (8x8).
|
| 89 |
-
S32 heightPixels;
|
| 90 |
-
S32 widthPixelsVp; // Viewport size in pixels.
|
| 91 |
-
S32 heightPixelsVp;
|
| 92 |
-
S32 widthBins; // widthPixels / CR_BIN_SIZE
|
| 93 |
-
S32 heightBins; // heightPixels / CR_BIN_SIZE
|
| 94 |
-
S32 numBins; // widthBins * heightBins
|
| 95 |
-
|
| 96 |
-
F32 xs; // Vertex position adjustments for tiled rendering.
|
| 97 |
-
F32 ys;
|
| 98 |
-
F32 xo;
|
| 99 |
-
F32 yo;
|
| 100 |
-
|
| 101 |
-
S32 widthTiles; // widthPixels / CR_TILE_SIZE
|
| 102 |
-
S32 heightTiles; // heightPixels / CR_TILE_SIZE
|
| 103 |
-
S32 numTiles; // widthTiles * heightTiles
|
| 104 |
-
|
| 105 |
-
U32 renderModeFlags;
|
| 106 |
-
S32 deferredClear; // 1 = Clear framebuffer before rendering triangles.
|
| 107 |
-
U32 clearColor;
|
| 108 |
-
U32 clearDepth;
|
| 109 |
-
|
| 110 |
-
// These are uniform across batch.
|
| 111 |
-
|
| 112 |
-
S32 maxSubtris;
|
| 113 |
-
S32 maxBinSegs;
|
| 114 |
-
S32 maxTileSegs;
|
| 115 |
-
|
| 116 |
-
// Setup output / bin input.
|
| 117 |
-
|
| 118 |
-
void* triSubtris; // maxSubtris * U8
|
| 119 |
-
void* triHeader; // maxSubtris * CRTriangleHeader
|
| 120 |
-
void* triData; // maxSubtris * CRTriangleData
|
| 121 |
-
|
| 122 |
-
// Bin output / coarse input.
|
| 123 |
-
|
| 124 |
-
void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32
|
| 125 |
-
void* binSegNext; // maxBinSegs * S32
|
| 126 |
-
void* binSegCount; // maxBinSegs * S32
|
| 127 |
-
void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none
|
| 128 |
-
void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris)
|
| 129 |
-
|
| 130 |
-
// Coarse output / fine input.
|
| 131 |
-
|
| 132 |
-
void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32
|
| 133 |
-
void* tileSegNext; // maxTileSegs * S32
|
| 134 |
-
void* tileSegCount; // maxTileSegs * S32
|
| 135 |
-
void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx)
|
| 136 |
-
void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none
|
| 137 |
-
|
| 138 |
-
// Surface buffers. Outer tile offset is baked into pointers.
|
| 139 |
-
|
| 140 |
-
void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32
|
| 141 |
-
void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32
|
| 142 |
-
void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled.
|
| 143 |
-
S32 strideX; // horizontal size in pixels
|
| 144 |
-
S32 strideY; // vertical stride in pixels
|
| 145 |
-
|
| 146 |
-
// Per-image parameters for first images are embedded here to avoid extra memcpy for small batches.
|
| 147 |
-
|
| 148 |
-
CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS];
|
| 149 |
-
const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS.
|
| 150 |
-
};
|
| 151 |
-
|
| 152 |
-
//------------------------------------------------------------------------
|
| 153 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp
DELETED
|
@@ -1,370 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#include "../../framework.h"
|
| 10 |
-
#include "PrivateDefs.hpp"
|
| 11 |
-
#include "Constants.hpp"
|
| 12 |
-
#include "RasterImpl.hpp"
|
| 13 |
-
#include <cuda_runtime.h>
|
| 14 |
-
|
| 15 |
-
using namespace CR;
|
| 16 |
-
using std::min;
|
| 17 |
-
using std::max;
|
| 18 |
-
|
| 19 |
-
//------------------------------------------------------------------------
|
| 20 |
-
// Kernel prototypes and variables.
|
| 21 |
-
|
| 22 |
-
void triangleSetupKernel (const CRParams p);
|
| 23 |
-
void binRasterKernel (const CRParams p);
|
| 24 |
-
void coarseRasterKernel (const CRParams p);
|
| 25 |
-
void fineRasterKernel (const CRParams p);
|
| 26 |
-
|
| 27 |
-
//------------------------------------------------------------------------
|
| 28 |
-
|
| 29 |
-
RasterImpl::RasterImpl(void)
|
| 30 |
-
: m_renderModeFlags (0),
|
| 31 |
-
m_deferredClear (false),
|
| 32 |
-
m_clearColor (0),
|
| 33 |
-
m_vertexPtr (NULL),
|
| 34 |
-
m_indexPtr (NULL),
|
| 35 |
-
m_numVertices (0),
|
| 36 |
-
m_numTriangles (0),
|
| 37 |
-
m_bufferSizesReported (0),
|
| 38 |
-
|
| 39 |
-
m_numImages (0),
|
| 40 |
-
m_bufferSizePixels (0, 0),
|
| 41 |
-
m_bufferSizeVp (0, 0),
|
| 42 |
-
m_sizePixels (0, 0),
|
| 43 |
-
m_sizeVp (0, 0),
|
| 44 |
-
m_offsetPixels (0, 0),
|
| 45 |
-
m_sizeBins (0, 0),
|
| 46 |
-
m_numBins (0),
|
| 47 |
-
m_sizeTiles (0, 0),
|
| 48 |
-
m_numTiles (0),
|
| 49 |
-
|
| 50 |
-
m_numSMs (1),
|
| 51 |
-
m_numCoarseBlocksPerSM (1),
|
| 52 |
-
m_numFineBlocksPerSM (1),
|
| 53 |
-
m_numFineWarpsPerBlock (1),
|
| 54 |
-
|
| 55 |
-
m_maxSubtris (1),
|
| 56 |
-
m_maxBinSegs (1),
|
| 57 |
-
m_maxTileSegs (1)
|
| 58 |
-
{
|
| 59 |
-
// Query relevant device attributes.
|
| 60 |
-
|
| 61 |
-
int currentDevice = 0;
|
| 62 |
-
NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&currentDevice));
|
| 63 |
-
NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice));
|
| 64 |
-
cudaFuncAttributes attr;
|
| 65 |
-
NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel));
|
| 66 |
-
m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS);
|
| 67 |
-
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0));
|
| 68 |
-
NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0));
|
| 69 |
-
|
| 70 |
-
// Setup functions.
|
| 71 |
-
|
| 72 |
-
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared));
|
| 73 |
-
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared));
|
| 74 |
-
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared));
|
| 75 |
-
NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared));
|
| 76 |
-
}
|
| 77 |
-
|
| 78 |
-
//------------------------------------------------------------------------
|
| 79 |
-
|
| 80 |
-
RasterImpl::~RasterImpl(void)
|
| 81 |
-
{
|
| 82 |
-
// Empty.
|
| 83 |
-
}
|
| 84 |
-
|
| 85 |
-
//------------------------------------------------------------------------
|
| 86 |
-
|
| 87 |
-
void RasterImpl::setBufferSize(Vec3i size)
|
| 88 |
-
{
|
| 89 |
-
// Internal buffer width and height must be divisible by tile size.
|
| 90 |
-
int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
|
| 91 |
-
int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
|
| 92 |
-
|
| 93 |
-
m_bufferSizePixels = Vec2i(w, h);
|
| 94 |
-
m_bufferSizeVp = Vec2i(size.x, size.y);
|
| 95 |
-
m_numImages = size.z;
|
| 96 |
-
|
| 97 |
-
m_colorBuffer.reset(w * h * size.z * sizeof(U32));
|
| 98 |
-
m_depthBuffer.reset(w * h * size.z * sizeof(U32));
|
| 99 |
-
}
|
| 100 |
-
|
| 101 |
-
//------------------------------------------------------------------------
|
| 102 |
-
|
| 103 |
-
void RasterImpl::setViewport(Vec2i size, Vec2i offset)
|
| 104 |
-
{
|
| 105 |
-
// Offset must be divisible by tile size.
|
| 106 |
-
NVDR_CHECK((offset.x & (CR_TILE_SIZE - 1)) == 0 && (offset.y & (CR_TILE_SIZE - 1)) == 0, "invalid viewport offset");
|
| 107 |
-
|
| 108 |
-
// Round internal viewport size to multiples of tile size.
|
| 109 |
-
int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
|
| 110 |
-
int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
|
| 111 |
-
|
| 112 |
-
m_sizePixels = Vec2i(w, h);
|
| 113 |
-
m_offsetPixels = offset;
|
| 114 |
-
m_sizeVp = Vec2i(size.x, size.y);
|
| 115 |
-
m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2;
|
| 116 |
-
m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2;
|
| 117 |
-
m_numTiles = m_sizeTiles.x * m_sizeTiles.y;
|
| 118 |
-
m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
|
| 119 |
-
m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
|
| 120 |
-
m_numBins = m_sizeBins.x * m_sizeBins.y;
|
| 121 |
-
}
|
| 122 |
-
|
| 123 |
-
void RasterImpl::swapDepthAndPeel(void)
|
| 124 |
-
{
|
| 125 |
-
m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer.
|
| 126 |
-
|
| 127 |
-
void* tmp = m_depthBuffer.getPtr();
|
| 128 |
-
m_depthBuffer.setPtr(m_peelBuffer.getPtr());
|
| 129 |
-
m_peelBuffer.setPtr(tmp);
|
| 130 |
-
}
|
| 131 |
-
|
| 132 |
-
//------------------------------------------------------------------------
|
| 133 |
-
|
| 134 |
-
bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream)
|
| 135 |
-
{
|
| 136 |
-
bool instanceMode = (!ranges);
|
| 137 |
-
|
| 138 |
-
int maxSubtrisSlack = 4096; // x 81B = 324KB
|
| 139 |
-
int maxBinSegsSlack = 256; // x 2137B = 534KB
|
| 140 |
-
int maxTileSegsSlack = 4096; // x 136B = 544KB
|
| 141 |
-
|
| 142 |
-
// Resize atomics as needed.
|
| 143 |
-
m_crAtomics .grow(m_numImages * sizeof(CRAtomics));
|
| 144 |
-
m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics));
|
| 145 |
-
|
| 146 |
-
// Size of these buffers doesn't depend on input.
|
| 147 |
-
m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
|
| 148 |
-
m_binTotal .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
|
| 149 |
-
m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
|
| 150 |
-
m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
|
| 151 |
-
|
| 152 |
-
// Construct per-image parameters and determine worst-case buffer sizes.
|
| 153 |
-
m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams));
|
| 154 |
-
CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
|
| 155 |
-
for (int i=0; i < m_numImages; i++)
|
| 156 |
-
{
|
| 157 |
-
CRImageParams& ip = imageParams[i];
|
| 158 |
-
|
| 159 |
-
int roundSize = CR_BIN_WARPS * 32;
|
| 160 |
-
int minBatches = CR_BIN_STREAMS_SIZE * 2;
|
| 161 |
-
int maxRounds = 32;
|
| 162 |
-
|
| 163 |
-
ip.triOffset = instanceMode ? 0 : ranges[i].x;
|
| 164 |
-
ip.triCount = instanceMode ? m_numTriangles : ranges[i].y;
|
| 165 |
-
ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize;
|
| 166 |
-
|
| 167 |
-
m_maxSubtris = max(m_maxSubtris, min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
|
| 168 |
-
m_maxBinSegs = max(m_maxBinSegs, max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack);
|
| 169 |
-
m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack);
|
| 170 |
-
}
|
| 171 |
-
|
| 172 |
-
// Retry until successful.
|
| 173 |
-
|
| 174 |
-
for (;;)
|
| 175 |
-
{
|
| 176 |
-
// Allocate buffers.
|
| 177 |
-
m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8));
|
| 178 |
-
m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader));
|
| 179 |
-
m_triData .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData));
|
| 180 |
-
|
| 181 |
-
m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32));
|
| 182 |
-
m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32));
|
| 183 |
-
m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32));
|
| 184 |
-
|
| 185 |
-
m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32));
|
| 186 |
-
m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32));
|
| 187 |
-
m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32));
|
| 188 |
-
|
| 189 |
-
// Report if buffers grow from last time.
|
| 190 |
-
size_t sizesTotal = getTotalBufferSizes();
|
| 191 |
-
if (sizesTotal > m_bufferSizesReported)
|
| 192 |
-
{
|
| 193 |
-
size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up.
|
| 194 |
-
sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age.
|
| 195 |
-
LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB";
|
| 196 |
-
m_bufferSizesReported = sizesMB << 20;
|
| 197 |
-
}
|
| 198 |
-
|
| 199 |
-
// Launch stages. Blocks until everything is done.
|
| 200 |
-
launchStages(instanceMode, peel, stream);
|
| 201 |
-
|
| 202 |
-
// Peeling iteration cannot fail, so no point checking things further.
|
| 203 |
-
if (peel)
|
| 204 |
-
break;
|
| 205 |
-
|
| 206 |
-
// Atomics after coarse stage are now available.
|
| 207 |
-
CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
|
| 208 |
-
|
| 209 |
-
// Success?
|
| 210 |
-
bool failed = false;
|
| 211 |
-
for (int i=0; i < m_numImages; i++)
|
| 212 |
-
{
|
| 213 |
-
const CRAtomics& a = atomics[i];
|
| 214 |
-
failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs);
|
| 215 |
-
}
|
| 216 |
-
if (!failed)
|
| 217 |
-
break; // Success!
|
| 218 |
-
|
| 219 |
-
// If we were already at maximum capacity, no can do.
|
| 220 |
-
if (m_maxSubtris == CR_MAXSUBTRIS_SIZE)
|
| 221 |
-
return false;
|
| 222 |
-
|
| 223 |
-
// Enlarge buffers and try again.
|
| 224 |
-
for (int i=0; i < m_numImages; i++)
|
| 225 |
-
{
|
| 226 |
-
const CRAtomics& a = atomics[i];
|
| 227 |
-
m_maxSubtris = max(m_maxSubtris, min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
|
| 228 |
-
m_maxBinSegs = max(m_maxBinSegs, a.numBinSegs + maxBinSegsSlack);
|
| 229 |
-
m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack);
|
| 230 |
-
}
|
| 231 |
-
}
|
| 232 |
-
|
| 233 |
-
m_deferredClear = false;
|
| 234 |
-
return true; // Success.
|
| 235 |
-
}
|
| 236 |
-
|
| 237 |
-
//------------------------------------------------------------------------
|
| 238 |
-
|
| 239 |
-
size_t RasterImpl::getTotalBufferSizes(void) const
|
| 240 |
-
{
|
| 241 |
-
return
|
| 242 |
-
m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params.
|
| 243 |
-
m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() +
|
| 244 |
-
m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() +
|
| 245 |
-
m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize();
|
| 246 |
-
}
|
| 247 |
-
|
| 248 |
-
//------------------------------------------------------------------------
|
| 249 |
-
|
| 250 |
-
void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
|
| 251 |
-
{
|
| 252 |
-
CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
|
| 253 |
-
|
| 254 |
-
// Unless peeling, initialize atomics to mostly zero.
|
| 255 |
-
CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
|
| 256 |
-
if (!peel)
|
| 257 |
-
{
|
| 258 |
-
memset(atomics, 0, m_numImages * sizeof(CRAtomics));
|
| 259 |
-
for (int i=0; i < m_numImages; i++)
|
| 260 |
-
atomics[i].numSubtris = imageParams[i].triCount;
|
| 261 |
-
}
|
| 262 |
-
|
| 263 |
-
// Copy to device. If peeling, this is the state after coarse raster launch on first iteration.
|
| 264 |
-
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
|
| 265 |
-
|
| 266 |
-
// Copy per-image parameters if there are more than fits in launch parameter block and we haven't done it already.
|
| 267 |
-
if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS)
|
| 268 |
-
{
|
| 269 |
-
int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
|
| 270 |
-
m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
|
| 271 |
-
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream));
|
| 272 |
-
}
|
| 273 |
-
|
| 274 |
-
// Set global parameters.
|
| 275 |
-
CRParams p;
|
| 276 |
-
{
|
| 277 |
-
p.atomics = (CRAtomics*)m_crAtomics.getPtr();
|
| 278 |
-
p.numImages = m_numImages;
|
| 279 |
-
p.totalCount = 0; // Only relevant in range mode.
|
| 280 |
-
p.instanceMode = instanceMode ? 1 : 0;
|
| 281 |
-
|
| 282 |
-
p.numVertices = m_numVertices;
|
| 283 |
-
p.numTriangles = m_numTriangles;
|
| 284 |
-
p.vertexBuffer = m_vertexPtr;
|
| 285 |
-
p.indexBuffer = m_indexPtr;
|
| 286 |
-
|
| 287 |
-
p.widthPixels = m_sizePixels.x;
|
| 288 |
-
p.heightPixels = m_sizePixels.y;
|
| 289 |
-
p.widthPixelsVp = m_sizeVp.x;
|
| 290 |
-
p.heightPixelsVp = m_sizeVp.y;
|
| 291 |
-
p.widthBins = m_sizeBins.x;
|
| 292 |
-
p.heightBins = m_sizeBins.y;
|
| 293 |
-
p.numBins = m_numBins;
|
| 294 |
-
|
| 295 |
-
p.xs = (float)m_bufferSizeVp.x / (float)m_sizeVp.x;
|
| 296 |
-
p.ys = (float)m_bufferSizeVp.y / (float)m_sizeVp.y;
|
| 297 |
-
p.xo = (float)(m_bufferSizeVp.x - m_sizeVp.x - 2 * m_offsetPixels.x) / (float)m_sizeVp.x;
|
| 298 |
-
p.yo = (float)(m_bufferSizeVp.y - m_sizeVp.y - 2 * m_offsetPixels.y) / (float)m_sizeVp.y;
|
| 299 |
-
|
| 300 |
-
p.widthTiles = m_sizeTiles.x;
|
| 301 |
-
p.heightTiles = m_sizeTiles.y;
|
| 302 |
-
p.numTiles = m_numTiles;
|
| 303 |
-
|
| 304 |
-
p.renderModeFlags = m_renderModeFlags;
|
| 305 |
-
p.deferredClear = m_deferredClear ? 1 : 0;
|
| 306 |
-
p.clearColor = m_clearColor;
|
| 307 |
-
p.clearDepth = CR_DEPTH_MAX;
|
| 308 |
-
|
| 309 |
-
p.maxSubtris = m_maxSubtris;
|
| 310 |
-
p.maxBinSegs = m_maxBinSegs;
|
| 311 |
-
p.maxTileSegs = m_maxTileSegs;
|
| 312 |
-
|
| 313 |
-
p.triSubtris = m_triSubtris.getPtr();
|
| 314 |
-
p.triHeader = m_triHeader.getPtr();
|
| 315 |
-
p.triData = m_triData.getPtr();
|
| 316 |
-
p.binSegData = m_binSegData.getPtr();
|
| 317 |
-
p.binSegNext = m_binSegNext.getPtr();
|
| 318 |
-
p.binSegCount = m_binSegCount.getPtr();
|
| 319 |
-
p.binFirstSeg = m_binFirstSeg.getPtr();
|
| 320 |
-
p.binTotal = m_binTotal.getPtr();
|
| 321 |
-
p.tileSegData = m_tileSegData.getPtr();
|
| 322 |
-
p.tileSegNext = m_tileSegNext.getPtr();
|
| 323 |
-
p.tileSegCount = m_tileSegCount.getPtr();
|
| 324 |
-
p.activeTiles = m_activeTiles.getPtr();
|
| 325 |
-
p.tileFirstSeg = m_tileFirstSeg.getPtr();
|
| 326 |
-
|
| 327 |
-
size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32);
|
| 328 |
-
p.colorBuffer = m_colorBuffer.getPtr(byteOffset);
|
| 329 |
-
p.depthBuffer = m_depthBuffer.getPtr(byteOffset);
|
| 330 |
-
p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr(byteOffset) : 0;
|
| 331 |
-
p.strideX = m_bufferSizePixels.x;
|
| 332 |
-
p.strideY = m_bufferSizePixels.y;
|
| 333 |
-
|
| 334 |
-
memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
|
| 335 |
-
p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr();
|
| 336 |
-
}
|
| 337 |
-
|
| 338 |
-
// Setup block sizes.
|
| 339 |
-
|
| 340 |
-
dim3 brBlock(32, CR_BIN_WARPS);
|
| 341 |
-
dim3 crBlock(32, CR_COARSE_WARPS);
|
| 342 |
-
dim3 frBlock(32, m_numFineWarpsPerBlock);
|
| 343 |
-
void* args[] = {&p};
|
| 344 |
-
|
| 345 |
-
// Launch stages from setup to coarse and copy atomics to host only if this is not a single-tile peeling iteration.
|
| 346 |
-
if (!peel)
|
| 347 |
-
{
|
| 348 |
-
if (instanceMode)
|
| 349 |
-
{
|
| 350 |
-
int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
|
| 351 |
-
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
|
| 352 |
-
}
|
| 353 |
-
else
|
| 354 |
-
{
|
| 355 |
-
for (int i=0; i < m_numImages; i++)
|
| 356 |
-
p.totalCount += imageParams[i].triCount;
|
| 357 |
-
int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
|
| 358 |
-
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
|
| 359 |
-
}
|
| 360 |
-
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
|
| 361 |
-
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
|
| 362 |
-
NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
|
| 363 |
-
}
|
| 364 |
-
|
| 365 |
-
// Fine rasterizer is launched always.
|
| 366 |
-
NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
|
| 367 |
-
NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
|
| 368 |
-
}
|
| 369 |
-
|
| 370 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp
DELETED
|
@@ -1,102 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
#include "PrivateDefs.hpp"
|
| 11 |
-
#include "Buffer.hpp"
|
| 12 |
-
#include "../CudaRaster.hpp"
|
| 13 |
-
|
| 14 |
-
namespace CR
|
| 15 |
-
{
|
| 16 |
-
//------------------------------------------------------------------------
|
| 17 |
-
|
| 18 |
-
class RasterImpl
|
| 19 |
-
{
|
| 20 |
-
public:
|
| 21 |
-
RasterImpl (void);
|
| 22 |
-
~RasterImpl (void);
|
| 23 |
-
|
| 24 |
-
void setBufferSize (Vec3i size);
|
| 25 |
-
void setViewport (Vec2i size, Vec2i offset);
|
| 26 |
-
void setRenderModeFlags (U32 flags) { m_renderModeFlags = flags; }
|
| 27 |
-
void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; }
|
| 28 |
-
void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer.
|
| 29 |
-
void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer.
|
| 30 |
-
bool drawTriangles (const Vec2i* ranges, bool peel, cudaStream_t stream);
|
| 31 |
-
void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer.
|
| 32 |
-
void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer.
|
| 33 |
-
void swapDepthAndPeel (void);
|
| 34 |
-
size_t getTotalBufferSizes (void) const;
|
| 35 |
-
|
| 36 |
-
private:
|
| 37 |
-
void launchStages (bool instanceMode, bool peel, cudaStream_t stream);
|
| 38 |
-
|
| 39 |
-
// State.
|
| 40 |
-
|
| 41 |
-
unsigned int m_renderModeFlags;
|
| 42 |
-
bool m_deferredClear;
|
| 43 |
-
unsigned int m_clearColor;
|
| 44 |
-
void* m_vertexPtr;
|
| 45 |
-
void* m_indexPtr;
|
| 46 |
-
int m_numVertices; // Input buffer size.
|
| 47 |
-
int m_numTriangles; // Input buffer size.
|
| 48 |
-
size_t m_bufferSizesReported; // Previously reported buffer sizes.
|
| 49 |
-
|
| 50 |
-
// Surfaces.
|
| 51 |
-
|
| 52 |
-
Buffer m_colorBuffer;
|
| 53 |
-
Buffer m_depthBuffer;
|
| 54 |
-
Buffer m_peelBuffer;
|
| 55 |
-
int m_numImages;
|
| 56 |
-
Vec2i m_bufferSizePixels; // Internal buffer size.
|
| 57 |
-
Vec2i m_bufferSizeVp; // Total viewport size.
|
| 58 |
-
Vec2i m_sizePixels; // Internal size at which all computation is done, buffers reserved, etc.
|
| 59 |
-
Vec2i m_sizeVp; // Size to which output will be cropped outside, determines viewport size.
|
| 60 |
-
Vec2i m_offsetPixels; // Viewport offset for tiled rendering.
|
| 61 |
-
Vec2i m_sizeBins;
|
| 62 |
-
S32 m_numBins;
|
| 63 |
-
Vec2i m_sizeTiles;
|
| 64 |
-
S32 m_numTiles;
|
| 65 |
-
|
| 66 |
-
// Launch sizes etc.
|
| 67 |
-
|
| 68 |
-
S32 m_numSMs;
|
| 69 |
-
S32 m_numCoarseBlocksPerSM;
|
| 70 |
-
S32 m_numFineBlocksPerSM;
|
| 71 |
-
S32 m_numFineWarpsPerBlock;
|
| 72 |
-
|
| 73 |
-
// Global intermediate buffers. Individual images have offsets to these.
|
| 74 |
-
|
| 75 |
-
Buffer m_crAtomics;
|
| 76 |
-
HostBuffer m_crAtomicsHost;
|
| 77 |
-
HostBuffer m_crImageParamsHost;
|
| 78 |
-
Buffer m_crImageParamsExtra;
|
| 79 |
-
Buffer m_triSubtris;
|
| 80 |
-
Buffer m_triHeader;
|
| 81 |
-
Buffer m_triData;
|
| 82 |
-
Buffer m_binFirstSeg;
|
| 83 |
-
Buffer m_binTotal;
|
| 84 |
-
Buffer m_binSegData;
|
| 85 |
-
Buffer m_binSegNext;
|
| 86 |
-
Buffer m_binSegCount;
|
| 87 |
-
Buffer m_activeTiles;
|
| 88 |
-
Buffer m_tileFirstSeg;
|
| 89 |
-
Buffer m_tileSegData;
|
| 90 |
-
Buffer m_tileSegNext;
|
| 91 |
-
Buffer m_tileSegCount;
|
| 92 |
-
|
| 93 |
-
// Actual buffer sizes.
|
| 94 |
-
|
| 95 |
-
S32 m_maxSubtris;
|
| 96 |
-
S32 m_maxBinSegs;
|
| 97 |
-
S32 m_maxTileSegs;
|
| 98 |
-
};
|
| 99 |
-
|
| 100 |
-
//------------------------------------------------------------------------
|
| 101 |
-
} // namespace CR
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl_.cu
DELETED
|
@@ -1,37 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#include "../CudaRaster.hpp"
|
| 10 |
-
#include "PrivateDefs.hpp"
|
| 11 |
-
#include "Constants.hpp"
|
| 12 |
-
#include "Util.inl"
|
| 13 |
-
|
| 14 |
-
namespace CR
|
| 15 |
-
{
|
| 16 |
-
|
| 17 |
-
//------------------------------------------------------------------------
|
| 18 |
-
// Stage implementations.
|
| 19 |
-
//------------------------------------------------------------------------
|
| 20 |
-
|
| 21 |
-
#include "TriangleSetup.inl"
|
| 22 |
-
#include "BinRaster.inl"
|
| 23 |
-
#include "CoarseRaster.inl"
|
| 24 |
-
#include "FineRaster.inl"
|
| 25 |
-
|
| 26 |
-
}
|
| 27 |
-
|
| 28 |
-
//------------------------------------------------------------------------
|
| 29 |
-
// Stage entry points.
|
| 30 |
-
//------------------------------------------------------------------------
|
| 31 |
-
|
| 32 |
-
// Kernel entry point for the triangle setup stage. Launch bounds: at most
// CR_SETUP_WARPS * 32 threads per block, CR_SETUP_OPT_BLOCKS resident blocks
// requested per multiprocessor. All work is delegated to the CR:: implementation.
__global__ void __launch_bounds__(CR_SETUP_WARPS * 32, CR_SETUP_OPT_BLOCKS)
triangleSetupKernel(const CR::CRParams p)
{
    CR::triangleSetupImpl(p);
}

// Kernel entry point for the bin rasterizer stage
// (at most CR_BIN_WARPS * 32 threads per block).
__global__ void __launch_bounds__(CR_BIN_WARPS * 32, 1)
binRasterKernel(const CR::CRParams p)
{
    CR::binRasterImpl(p);
}

// Kernel entry point for the coarse rasterizer stage
// (at most CR_COARSE_WARPS * 32 threads per block).
__global__ void __launch_bounds__(CR_COARSE_WARPS * 32, 1)
coarseRasterKernel(const CR::CRParams p)
{
    CR::coarseRasterImpl(p);
}

// Kernel entry point for the fine rasterizer stage
// (at most CR_FINE_MAX_WARPS * 32 threads per block).
__global__ void __launch_bounds__(CR_FINE_MAX_WARPS * 32, 1)
fineRasterKernel(const CR::CRParams p)
{
    CR::fineRasterImpl(p);
}
|
| 36 |
-
|
| 37 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl
DELETED
|
@@ -1,402 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
//------------------------------------------------------------------------
|
| 10 |
-
|
| 11 |
-
// Projects three clip-space vertices and snaps them to integer subpixel
// coordinates.
//
//   p           Rasterizer parameters; only the viewport size is read here.
//   v0, v1, v2  Clip-space vertex positions (x, y, z, w).
//   p0, p1, p2  [out] Snapped x/y per vertex, scaled by half the viewport size
//               expressed in subpixel units, converted with saturation.
//   rcpW        [out] Reciprocals of v0.w, v1.w, v2.w.
//   lo, hi      [out] Component-wise minimum / maximum of p0, p1, p2.
__device__ __inline__ void snapTriangle(
    const CRParams& p,
    float4 v0, float4 v1, float4 v2,
    int2& p0, int2& p1, int2& p2, float3& rcpW, int2& lo, int2& hi)
{
    // Half the viewport extent in subpixel units.
    const F32 halfExtentX = (F32)(p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
    const F32 halfExtentY = (F32)(p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));

    // Perspective division factors, one per vertex.
    const F32 invW0 = 1.0f / v0.w;
    const F32 invW1 = 1.0f / v1.w;
    const F32 invW2 = 1.0f / v2.w;
    rcpW = make_float3(invW0, invW1, invW2);

    // Snap each vertex to the fixed-point grid.
    p0.x = f32_to_s32_sat(v0.x * invW0 * halfExtentX);
    p0.y = f32_to_s32_sat(v0.y * invW0 * halfExtentY);
    p1.x = f32_to_s32_sat(v1.x * invW1 * halfExtentX);
    p1.y = f32_to_s32_sat(v1.y * invW1 * halfExtentY);
    p2.x = f32_to_s32_sat(v2.x * invW2 * halfExtentX);
    p2.y = f32_to_s32_sat(v2.y * invW2 * halfExtentY);

    // Axis-aligned bounding box of the snapped triangle.
    lo.x = min_min(p0.x, p1.x, p2.x);
    lo.y = min_min(p0.y, p1.y, p2.y);
    hi.x = max_max(p0.x, p1.x, p2.x);
    hi.y = max_max(p0.y, p1.y, p2.y);
}
|
| 25 |
-
|
| 26 |
-
//------------------------------------------------------------------------
|
| 27 |
-
|
| 28 |
-
// Chooses the CR_FLIPBIT_* combination for an edge with direction (dx, dy).
// Three independent tests each toggle a fixed group of flip bits; the result
// is the XOR of the groups whose test holds.
__device__ __inline__ U32 cover8x8_selectFlips(S32 dx, S32 dy) // 10 instr
{
    const U32 flipBothAxes = (U32)((1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y));
    const U32 complement   = (U32)(1 << CR_FLIPBIT_COMPL);
    const U32 swapAndFlipY = (U32)((1 << CR_FLIPBIT_SWAP_XY) ^ (1 << CR_FLIPBIT_FLIP_Y));

    const bool pointsUpOrLeft = (dy > 0) || (dy == 0 && dx <= 0);
    const bool pointsRight    = (dx > 0);
    const bool steep          = (::abs(dx) < ::abs(dy));

    U32 result = 0;
    if (pointsUpOrLeft)
        result ^= flipBothAxes ^ complement;
    if (pointsRight)
        result ^= flipBothAxes;
    if (steep)
        result ^= swapAndFlipY;
    return result;
}
|
| 39 |
-
|
| 40 |
-
//------------------------------------------------------------------------
|
| 41 |
-
|
| 42 |
-
// Evaluates the three edge functions of triangle (p0, p1, p2) at the sample
// located at (sx, sy) in biased fixed-point coordinates. The signs are flipped
// for negative-area triangles. Returns true when no edge function is negative.
__device__ __inline__ bool prepareTriangle_sampleCovered(
    int2 p0, int2 p1, int2 p2,
    int biasX, int biasY, int sx, int sy, S32 area)
{
    // Vertex positions relative to the sample.
    int2 t0 = make_int2(add_sub(p0.x, biasX, sx), add_sub(p0.y, biasY, sy));
    int2 t1 = make_int2(add_sub(p1.x, biasX, sx), add_sub(p1.y, biasY, sy));
    int2 t2 = make_int2(add_sub(p2.x, biasX, sx), add_sub(p2.y, biasY, sy));

    S32 e0 = t0.x * t1.y - t0.y * t1.x;
    S32 e1 = t1.x * t2.y - t1.y * t2.x;
    S32 e2 = t2.x * t0.y - t2.y * t0.x;
    if (area < 0)
    {
        e0 = -e0;
        e1 = -e1;
        e2 = -e2;
    }
    return (e0 >= 0 && e1 >= 0 && e2 >= 0);
}

// Computes the edge vectors and signed area of a snapped triangle and decides
// whether it can be culled before rasterization.
//
//   p            Rasterizer parameters (render mode flags, viewport size).
//   p0, p1, p2   Snapped vertex positions.
//   lo, hi       Bounding box of the snapped vertices.
//   d1, d2       [out] Edge vectors p1 - p0 and p2 - p0.
//   area         [out] Cross product d1 x d2; its sign gives the winding.
//
// Returns false when the triangle is degenerate, backfacing with culling
// enabled, or provably misses every sample position; true otherwise.
// The outputs are written in all cases.
__device__ __inline__ bool prepareTriangle(
    const CRParams& p,
    int2 p0, int2 p1, int2 p2, int2 lo, int2 hi,
    int2& d1, int2& d2, S32& area)
{
    d1 = make_int2(p1.x - p0.x, p1.y - p0.y);
    d2 = make_int2(p2.x - p0.x, p2.y - p0.y);
    area = d1.x * d2.y - d1.y * d2.x;

    // Degenerate, or backfacing while backface culling is enabled => cull.
    const bool cullBackfaces =
        (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableBackfaceCulling) != 0;
    if (area == 0 || (area < 0 && cullBackfaces))
        return false;

    // Snap the bounding box to sample positions: lo rounds up, hi rounds down.
    const int sampleSize = 1 << CR_SUBPIXEL_LOG2;
    const int halfSample = sampleSize >> 1;
    const int snapMask   = -sampleSize;
    const int biasX = (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - halfSample;
    const int biasY = (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - halfSample;
    const int lox = (int)add_add(lo.x, sampleSize - 1, biasX) & snapMask;
    const int loy = (int)add_add(lo.y, sampleSize - 1, biasY) & snapMask;
    const int hix = (hi.x + biasX) & snapMask;
    const int hiy = (hi.y + biasY) & snapMask;

    // Bounding box falls between samples => cull.
    if (lox > hix || loy > hiy)
        return false;

    // More than two candidate samples => keep without further testing.
    const int diff = add_sub(hix, hiy, lox) - loy;
    if (diff > sampleSize)
        return true;

    // One or two candidate samples: keep if the first one is covered.
    if (prepareTriangle_sampleCovered(p0, p1, p2, biasX, biasY, lox, loy, area))
        return true;

    // A single candidate that is not covered => cull.
    if (diff == 0)
        return false;

    // Otherwise the decision rests on the second candidate.
    return prepareTriangle_sampleCovered(p0, p1, p2, biasX, biasY, hix, hiy, area);
}
|
| 117 |
-
|
| 118 |
-
//------------------------------------------------------------------------
|
| 119 |
-
|
| 120 |
-
// Writes the CRTriangleHeader / CRTriangleData records for one triangle that
// has already passed prepareTriangle().
//
//   p              Rasterizer parameters (viewport size).
//   th, td         Destination header and data records; each is written as
//                  one uint4.
//   triId          Triangle identifier stored in the data record.
//   v0z, v1z, v2z  Clip-space z of the three vertices.
//   p0, p1, p2     Snapped vertex positions.
//   rcpW           Reciprocal w of the three vertices.
//   d1, d2, area   Edge vectors and signed area from prepareTriangle().
__device__ __inline__ void setupTriangle(
    const CRParams& p,
    CRTriangleHeader* th, CRTriangleData* td, int triId,
    float v0z, float v1z, float v2z,
    int2 p0, int2 p1, int2 p2, float3 rcpW,
    int2 d1, int2 d2, S32 area)
{
    // Negative area can only reach this point when backface culling is off.
    // Exchange vertices 1 and 2 so the rest of the setup sees positive area.
    if (area < 0)
    {
        swap(p1, p2);
        swap(d1, d2);
        swap(v1z, v2z);
        swap(rcpW.y, rcpW.z);
        area = -area;
    }

    // Vertex 0 offset by half the viewport size in subpixel units.
    int2 windowV0;
    windowV0.x = p0.x + (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
    windowV0.y = p0.y + (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));

    // Depth plane equation: per-vertex depth mapped into the
    // [CR_DEPTH_MIN, CR_DEPTH_MAX] range, anchored half a pixel before windowV0.
    const F32 depthScale = (F32)(CR_DEPTH_MAX - CR_DEPTH_MIN) * 0.5f;
    const F32 depthBias  = (F32)(CR_DEPTH_MAX + CR_DEPTH_MIN) * 0.5f;
    float3 depth = make_float3(
        (v0z * depthScale) * rcpW.x + depthBias,
        (v1z * depthScale) * rcpW.y + depthBias,
        (v2z * depthScale) * rcpW.z + depthBias
    );
    const int halfPixel = 1 << (CR_SUBPIXEL_LOG2 - 1);
    int2 depthOrigin = make_int2(windowV0.x - halfPixel, windowV0.y - halfPixel);
    uint3 depthPleq = setupPleq(depth, depthOrigin, d1, d2, 1.0f / (F32)area);

    // Smallest vertex depth minus CR_LERP_ERROR(0), saturated to U32.
    const F32 nearestDepth = fminf(fminf(depth.x, depth.y), depth.z);
    U32 zmin = f32_to_u32_sat(nearestDepth - (F32)CR_LERP_ERROR(0));

    // Data record: depth plane equation followed by the triangle id.
    *(uint4*)td = make_uint4(depthPleq.x, depthPleq.y, depthPleq.z, triId);

    // Flip bits for the edges 0->1, 1->2 and 2->0.
    const U32 flips01 = cover8x8_selectFlips(d1.x, d1.y);
    const U32 flips12 = cover8x8_selectFlips(d2.x - d1.x, d2.y - d1.y);
    const U32 flips20 = cover8x8_selectFlips(-d2.x, -d2.y);

    // Header record: one prmt-packed x/y word per vertex, then a word holding
    // the upper 20 bits of zmin combined with the shifted flip bits.
    const U32 packedV0 = prmt(p0.x, p0.y, 0x5410);
    const U32 packedV1 = prmt(p1.x, p1.y, 0x5410);
    const U32 packedV2 = prmt(p2.x, p2.y, 0x5410);
    const U32 packedMisc = (zmin & 0xfffff000u) | (flips01 << 6) | (flips12 << 2) | (flips20 >> 2);
    *(uint4*)th = make_uint4(packedV0, packedV1, packedV2, packedMisc);
}
|
| 178 |
-
|
| 179 |
-
//------------------------------------------------------------------------
|
| 180 |
-
|
| 181 |
-
// Triangle setup stage: one thread handles one input triangle (task).
//
// For its triangle the thread
//   1. resolves which image the task belongs to,
//   2. validates the triangle and vertex indices,
//   3. applies the viewport scale/offset and culls triangles that lie entirely
//      outside one frustum plane,
//   4. takes a fast path (snap + prepare + setup in place) when all vertices are
//      inside the depth range and the snapped triangle is small enough, and
//   5. otherwise clips against the frustum and emits the resulting fan of
//      subtriangles.
//
// triSubtris[taskIdx] receives the number of (sub)triangles produced for the
// task; 0 means the triangle was rejected.
__device__ __inline__ void triangleSetupImpl(const CRParams p)
{
    // Per-thread scratch row in shared memory for clipped-polygon barycentrics,
    // stored as consecutive pairs (18 floats per thread).
    __shared__ F32 s_bary[CR_SETUP_WARPS * 32][18];
    F32* bary = s_bary[threadIdx.x + threadIdx.y * 32];

    // Compute task and image indices.

    int taskIdx = threadIdx.x + 32 * (threadIdx.y + CR_SETUP_WARPS * blockIdx.x);
    int imageIdx = 0;
    if (p.instanceMode)
    {
        // Instance mode: the image is selected by blockIdx.z and every image
        // processes the same numTriangles triangles.
        imageIdx = blockIdx.z;
        if (taskIdx >= p.numTriangles)
            return;
    }
    else
    {
        // Range mode: walk the per-image triangle counts until the task falls
        // inside one image; taskIdx becomes the index local to that image.
        while (imageIdx < p.numImages)
        {
            int count = getImageParams(p, imageIdx).triCount;
            if (taskIdx < count)
                break;
            taskIdx -= count;
            imageIdx += 1;
        }
        if (imageIdx == p.numImages)
            return; // Task index is past the last image.
    }

    // Per-image data structures.

    const CRImageParams& ip = getImageParams(p, imageIdx);
    CRAtomics& atomics = p.atomics[imageIdx];

    // Each image owns a slice of maxSubtris entries in the triangle buffers.
    const int* indexBuffer = (const int*)p.indexBuffer;
    U8* triSubtris = (U8*)p.triSubtris + imageIdx * p.maxSubtris;
    CRTriangleHeader* triHeader = (CRTriangleHeader*)p.triHeader + imageIdx * p.maxSubtris;
    CRTriangleData* triData = (CRTriangleData*)p.triData + imageIdx * p.maxSubtris;

    // Determine triangle index.

    int triIdx = taskIdx;
    if (!p.instanceMode)
        triIdx += ip.triOffset;

    // Read vertex indices.

    // The unsigned comparison also rejects negative indices.
    if ((U32)triIdx >= (U32)p.numTriangles)
    {
        // Bad triangle index.
        triSubtris[taskIdx] = 0;
        return;
    }

    uint4 vidx;
    vidx.x = indexBuffer[triIdx * 3 + 0];
    vidx.y = indexBuffer[triIdx * 3 + 1];
    vidx.z = indexBuffer[triIdx * 3 + 2];
    vidx.w = triIdx + 1; // Triangle index.

    if (vidx.x >= (U32)p.numVertices ||
        vidx.y >= (U32)p.numVertices ||
        vidx.z >= (U32)p.numVertices)
    {
        // Bad vertex index.
        triSubtris[taskIdx] = 0;
        return;
    }

    // Read vertex positions.

    const float4* vertexBuffer = (const float4*)p.vertexBuffer;
    if (p.instanceMode)
        vertexBuffer += p.numVertices * imageIdx; // Instance offset.

    float4 v0 = vertexBuffer[vidx.x];
    float4 v1 = vertexBuffer[vidx.y];
    float4 v2 = vertexBuffer[vidx.z];

    // Adjust vertex positions according to current viewport size and offset.

    v0.x = v0.x * p.xs + v0.w * p.xo;
    v0.y = v0.y * p.ys + v0.w * p.yo;
    v1.x = v1.x * p.xs + v1.w * p.xo;
    v1.y = v1.y * p.ys + v1.w * p.yo;
    v2.x = v2.x * p.xs + v2.w * p.xo;
    v2.y = v2.y * p.ys + v2.w * p.yo;

    // Outside view frustum => cull.

    // The full six-plane test only runs when v0 itself is outside the frustum:
    // a triangle can be entirely outside one plane only if v0 is outside it.
    // Non-short-circuit | and & are used on the comparison results.
    if (v0.w < fabsf(v0.x) | v0.w < fabsf(v0.y) | v0.w < fabsf(v0.z))
    {
        if ((v0.w < +v0.x & v1.w < +v1.x & v2.w < +v2.x) |
            (v0.w < -v0.x & v1.w < -v1.x & v2.w < -v2.x) |
            (v0.w < +v0.y & v1.w < +v1.y & v2.w < +v2.y) |
            (v0.w < -v0.y & v1.w < -v1.y & v2.w < -v2.y) |
            (v0.w < +v0.z & v1.w < +v1.z & v2.w < +v2.z) |
            (v0.w < -v0.z & v1.w < -v1.z & v2.w < -v2.z))
        {
            triSubtris[taskIdx] = 0;
            return;
        }
    }

    // Inside depth range => try to snap vertices.

    if (v0.w >= fabsf(v0.z) & v1.w >= fabsf(v1.z) & v2.w >= fabsf(v2.z))
    {
        // Inside S16 range and small enough => fast path.
        // Note: aabbLimit comes from the fact that cover8x8
        // does not support guardband with maximal viewport.

        int2 p0, p1, p2, lo, hi;
        float3 rcpW;

        snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
        S32 loxy = ::min(lo.x, lo.y);
        S32 hixy = ::max(hi.x, hi.y);
        S32 aabbLimit = (1 << (CR_MAXVIEWPORT_LOG2 + CR_SUBPIXEL_LOG2)) - 1;

        if (loxy >= -32768 && hixy <= 32767 && hixy - loxy <= aabbLimit)
        {
            int2 d1, d2;
            S32 area;
            bool res = prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area);
            triSubtris[taskIdx] = res ? 1 : 0;

            // A single unclipped triangle is stored in the task's own slot.
            if (res)
                setupTriangle(
                    p,
                    &triHeader[taskIdx], &triData[taskIdx], vidx.w,
                    v0.z, v1.z, v2.z,
                    p0, p1, p2, rcpW,
                    d1, d2, area);

            return;
        }
    }

    // Clip to view frustum.

    // The clipped polygon is described by barycentric pairs relative to
    // ov0 (origin) and the edge vectors od1 / od2.
    float4 ov0 = v0;
    float4 od1 = make_float4(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z, v1.w - v0.w);
    float4 od2 = make_float4(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z, v2.w - v0.w);
    int numVerts = clipTriangleWithFrustum(bary, &ov0.x, &v1.x, &v2.x, &od1.x, &od2.x);

    // Count non-culled subtriangles.

    // Polygon vertices 0 and 1; vertex 0 is shared by every subtriangle of the fan.
    v0.x = ov0.x + od1.x * bary[0] + od2.x * bary[1];
    v0.y = ov0.y + od1.y * bary[0] + od2.y * bary[1];
    v0.z = ov0.z + od1.z * bary[0] + od2.z * bary[1];
    v0.w = ov0.w + od1.w * bary[0] + od2.w * bary[1];
    v1.x = ov0.x + od1.x * bary[2] + od2.x * bary[3];
    v1.y = ov0.y + od1.y * bary[2] + od2.y * bary[3];
    v1.z = ov0.z + od1.z * bary[2] + od2.z * bary[3];
    v1.w = ov0.w + od1.w * bary[2] + od2.w * bary[3];
    float4 tv1 = v1; // Saved so the second pass can restart the fan.

    // First pass: only count the subtriangles that survive prepareTriangle().
    int numSubtris = 0;
    for (int i = 2; i < numVerts; i++)
    {
        v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
        v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
        v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
        v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];

        int2 p0, p1, p2, lo, hi, d1, d2;
        float3 rcpW;
        S32 area;

        snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
        if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
            numSubtris++;

        v1 = v2;
    }

    triSubtris[taskIdx] = numSubtris;

    // Multiple subtriangles => allocate.

    // Zero or one subtriangle uses the task's own slot. More than one reserves
    // a contiguous range through the per-image atomic counter and records its
    // start in the task's header.
    int subtriBase = taskIdx;
    if (numSubtris > 1)
    {
        subtriBase = atomicAdd(&atomics.numSubtris, numSubtris);
        triHeader[taskIdx].misc = subtriBase;
        // Reserved range exceeds the buffer: skip the setup pass entirely.
        if (subtriBase + numSubtris > p.maxSubtris)
            numVerts = 0;
    }

    // Setup subtriangles.

    // Second pass: repeat the fan walk and write each surviving subtriangle.
    v1 = tv1;
    for (int i = 2; i < numVerts; i++)
    {
        v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
        v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
        v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
        v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];

        int2 p0, p1, p2, lo, hi, d1, d2;
        float3 rcpW;
        S32 area;

        snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
        if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
        {
            setupTriangle(
                p,
                &triHeader[subtriBase], &triData[subtriBase], vidx.w,
                v0.z, v1.z, v2.z,
                p0, p1, p2, rcpW,
                d1, d2, area);

            subtriBase++;
        }

        v1 = v2;
    }
}
|
| 401 |
-
|
| 402 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl
DELETED
|
@@ -1,452 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#include "PrivateDefs.hpp"
|
| 10 |
-
|
| 11 |
-
namespace CR
|
| 12 |
-
{
|
| 13 |
-
//------------------------------------------------------------------------
|
| 14 |
-
|
| 15 |
-
// Exchanges the values of a and b through a temporary copy.
template<class T> __device__ __inline__ void swap(T& a, T& b) { T t = a; a = b; b = t; }

// Split a 64-bit integer into its low / high 32-bit halves, or join two halves,
// by reinterpreting the bits as a double and using the double<->int intrinsics.
__device__ __inline__ U32 getLo (U64 a) { return __double2loint(__longlong_as_double(a)); }
__device__ __inline__ S32 getLo (S64 a) { return __double2loint(__longlong_as_double(a)); }
__device__ __inline__ U32 getHi (U64 a) { return __double2hiint(__longlong_as_double(a)); }
__device__ __inline__ S32 getHi (S64 a) { return __double2hiint(__longlong_as_double(a)); }
__device__ __inline__ U64 combineLoHi (U32 lo, U32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); }
__device__ __inline__ S64 combineLoHi (S32 lo, S32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); }
// Read the %lanemask_lt / _le / _gt / _ge special registers.
__device__ __inline__ U32 getLaneMaskLt (void) { U32 r; asm("mov.u32 %0, %lanemask_lt;" : "=r"(r)); return r; }
__device__ __inline__ U32 getLaneMaskLe (void) { U32 r; asm("mov.u32 %0, %lanemask_le;" : "=r"(r)); return r; }
__device__ __inline__ U32 getLaneMaskGt (void) { U32 r; asm("mov.u32 %0, %lanemask_gt;" : "=r"(r)); return r; }
__device__ __inline__ U32 getLaneMaskGe (void) { U32 r; asm("mov.u32 %0, %lanemask_ge;" : "=r"(r)); return r; }
// Bit index of the most significant set bit of v (PTX bfind.u32).
__device__ __inline__ int findLeadingOne (U32 v) { U32 r; asm("bfind.u32 %0, %1;" : "=r"(r) : "r"(v)); return r; }
// True for the calling lane when no lower-numbered lane voted in the full-mask ballot.
__device__ __inline__ bool singleLane (void) { return ((::__ballot_sync(~0u, true) & getLaneMaskLt()) == 0); }

// 64-bit addition of (ahi:alo) + (bhi:blo), returned as separate 32-bit halves.
__device__ __inline__ void add_add_carry (U32& rlo, U32 alo, U32 blo, U32& rhi, U32 ahi, U32 bhi) { U64 r = combineLoHi(alo, ahi) + combineLoHi(blo, bhi); rlo = getLo(r); rhi = getHi(r); }
// Float -> integer conversions via PTX cvt. The .sat variants saturate to the
// destination range; .rni / .rmi are the rounding modifiers passed to cvt.
__device__ __inline__ S32 f32_to_s32_sat (F32 a) { S32 v; asm("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
__device__ __inline__ U32 f32_to_u32_sat (F32 a) { U32 v; asm("cvt.rni.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
__device__ __inline__ U32 f32_to_u32_sat_rmi (F32 a) { U32 v; asm("cvt.rmi.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
__device__ __inline__ U32 f32_to_u8_sat (F32 a) { U32 v; asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
__device__ __inline__ S64 f32_to_s64 (F32 a) { S64 v; asm("cvt.rni.s64.f32 %0, %1;" : "=l"(v) : "f"(a)); return v; }
// Add / subtract selected 16-bit halves of a and b (PTX video instructions;
// .h0 selects the low half-word, .h1 the high half-word). The s16 variants
// treat the halves as signed, the u16 variants as unsigned.
__device__ __inline__ S32 add_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 add_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 add_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 add_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 sub_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 sub_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 sub_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 sub_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 sub_u16lo_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 sub_u16hi_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 sub_u16lo_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ S32 sub_u16hi_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
// Add byte N (.b0 - .b3) of a to b.
__device__ __inline__ U32 add_b0 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b0, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ U32 add_b1 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ U32 add_b2 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b2, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
__device__ __inline__ U32 add_b3 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b3, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
// Multiply-add with byte N of a as the first factor; the _b3 variants also
// take byte 3 of b as the second factor.
__device__ __inline__ U32 vmad_b0 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 vmad_b1 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 vmad_b2 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 vmad_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 vmad_b0_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 vmad_b1_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 vmad_b2_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 vmad_b3_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
// a + b / a - b with the result written through the %0.b0 byte mask; the
// merge operand is a zero register.
__device__ __inline__ U32 add_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vadd.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; }
__device__ __inline__ U32 sub_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vsub.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; }
// Fused three-operand forms: the primary operation is applied to a and b and
// the secondary operation (the instruction suffix) combines the result with c.
__device__ __inline__ S32 max_max (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ S32 min_min (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ S32 max_add (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ S32 min_add (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 add_add (U32 a, U32 b, U32 c) { U32 v; asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ U32 sub_add (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
// Note the operand order (a, c, b): this evaluates (a - c) + b, i.e. a + b - c.
__device__ __inline__ U32 add_sub (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(c), "r"(b)); return v; }
// Saturating signed adds producing an unsigned result: add_clamp_0_x applies a
// secondary min with c; the _b0 / _b2 variants write through a byte mask.
__device__ __inline__ S32 add_clamp_0_x (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ S32 add_clamp_b0 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ S32 add_clamp_b2 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b2, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
// Byte permute of the pair (a, b) using selector c (PTX prmt.b32).
__device__ __inline__ U32 prmt (U32 a, U32 b, U32 c) { U32 v; asm("prmt.b32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
// Convert a to s16 with cvt.s16.u32 and return the result as S32.
__device__ __inline__ S32 u32lo_sext (U32 a) { U32 v; asm("cvt.s16.u32 %0, %1;" : "=r"(v) : "r"(a)); return v; }
// Select between a and b based on the sign of c (PTX slct), for U32 / S32 / F32 values.
__device__ __inline__ U32 slct (U32 a, U32 b, S32 c) { U32 v; asm("slct.u32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ S32 slct (S32 a, S32 b, S32 c) { S32 v; asm("slct.s32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
__device__ __inline__ F32 slct (F32 a, F32 b, S32 c) { F32 v; asm("slct.f32.s32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "r"(c)); return v; }
// Signed a >= b comparison returning the result as a U32 mask (PTX set.ge).
__device__ __inline__ U32 isetge (S32 a, S32 b) { U32 v; asm("set.ge.u32.s32 %0, %1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
// Approximate double-precision reciprocal (rcp.approx.ftz.f64).
__device__ __inline__ F64 rcp_approx (F64 a) { F64 v; asm("rcp.approx.ftz.f64 %0, %1;" : "=d"(v) : "d"(a)); return v; }
// Fused multiply-add a * b + c using the .rm rounding modifier.
__device__ __inline__ F32 fma_rm (F32 a, F32 b, F32 c) { F32 v; asm("fma.rm.f32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "f"(c)); return v; }
|
| 80 |
-
__device__ __inline__ U32 idiv_fast (U32 a, U32 b);
|
| 81 |
-
|
| 82 |
-
__device__ __inline__ uint3 setupPleq (float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp);
|
| 83 |
-
|
| 84 |
-
__device__ __inline__ void cover8x8_setupLUT (volatile U64* lut);
|
| 85 |
-
__device__ __inline__ U64 cover8x8_exact_fast (S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut); // Assumes viewport <= 2^11, subpixels <= 2^4, no guardband.
|
| 86 |
-
__device__ __inline__ U64 cover8x8_lookupMask (S64 yinit, U32 yinc, U32 flips, volatile const U64* lut);
|
| 87 |
-
|
| 88 |
-
__device__ __inline__ U64 cover8x8_exact_noLUT (S32 ox, S32 oy, S32 dx, S32 dy); // optimized reference implementation, does not require look-up table
|
| 89 |
-
__device__ __inline__ U64 cover8x8_conservative_noLUT (S32 ox, S32 oy, S32 dx, S32 dy);
|
| 90 |
-
__device__ __inline__ U64 cover8x8_generateMask_noLUT (S32 curr, S32 dx, S32 dy);
|
| 91 |
-
|
| 92 |
-
template <class T> __device__ __inline__ void sortShared(T* ptr, int numItems); // Assumes that numItems <= threadsInBlock. Must sync before & after the call.
|
| 93 |
-
|
| 94 |
-
__device__ __inline__ const CRImageParams& getImageParams(const CRParams& p, int idx)
|
| 95 |
-
{
|
| 96 |
-
return (idx < CR_EMBED_IMAGE_PARAMS) ? p.imageParamsFirst[idx] : p.imageParamsExtra[idx - CR_EMBED_IMAGE_PARAMS];
|
| 97 |
-
}
|
| 98 |
-
|
| 99 |
-
//------------------------------------------------------------------------
|
| 100 |
-
|
| 101 |
-
__device__ __inline__ int clipPolygonWithPlane(F32* baryOut, const F32* baryIn, int numIn, F32 v0, F32 v1, F32 v2)
|
| 102 |
-
{
|
| 103 |
-
int numOut = 0;
|
| 104 |
-
if (numIn >= 3)
|
| 105 |
-
{
|
| 106 |
-
int ai = (numIn - 1) * 2;
|
| 107 |
-
F32 av = v0 + v1 * baryIn[ai + 0] + v2 * baryIn[ai + 1];
|
| 108 |
-
for (int bi = 0; bi < numIn * 2; bi += 2)
|
| 109 |
-
{
|
| 110 |
-
F32 bv = v0 + v1 * baryIn[bi + 0] + v2 * baryIn[bi + 1];
|
| 111 |
-
if (av * bv < 0.0f)
|
| 112 |
-
{
|
| 113 |
-
F32 bc = av / (av - bv);
|
| 114 |
-
F32 ac = 1.0f - bc;
|
| 115 |
-
baryOut[numOut + 0] = baryIn[ai + 0] * ac + baryIn[bi + 0] * bc;
|
| 116 |
-
baryOut[numOut + 1] = baryIn[ai + 1] * ac + baryIn[bi + 1] * bc;
|
| 117 |
-
numOut += 2;
|
| 118 |
-
}
|
| 119 |
-
if (bv >= 0.0f)
|
| 120 |
-
{
|
| 121 |
-
baryOut[numOut + 0] = baryIn[bi + 0];
|
| 122 |
-
baryOut[numOut + 1] = baryIn[bi + 1];
|
| 123 |
-
numOut += 2;
|
| 124 |
-
}
|
| 125 |
-
ai = bi;
|
| 126 |
-
av = bv;
|
| 127 |
-
}
|
| 128 |
-
}
|
| 129 |
-
return (numOut >> 1);
|
| 130 |
-
}
|
| 131 |
-
|
| 132 |
-
//------------------------------------------------------------------------
|
| 133 |
-
|
| 134 |
-
__device__ __inline__ int clipTriangleWithFrustum(F32* bary, const F32* v0, const F32* v1, const F32* v2, const F32* d1, const F32* d2)
|
| 135 |
-
{
|
| 136 |
-
int num = 3;
|
| 137 |
-
bary[0] = 0.0f, bary[1] = 0.0f;
|
| 138 |
-
bary[2] = 1.0f, bary[3] = 0.0f;
|
| 139 |
-
bary[4] = 0.0f, bary[5] = 1.0f;
|
| 140 |
-
|
| 141 |
-
if ((v0[3] < fabsf(v0[0])) | (v1[3] < fabsf(v1[0])) | (v2[3] < fabsf(v2[0])))
|
| 142 |
-
{
|
| 143 |
-
F32 temp[18];
|
| 144 |
-
num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[0], d1[3] + d1[0], d2[3] + d2[0]);
|
| 145 |
-
num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[0], d1[3] - d1[0], d2[3] - d2[0]);
|
| 146 |
-
}
|
| 147 |
-
if ((v0[3] < fabsf(v0[1])) | (v1[3] < fabsf(v1[1])) | (v2[3] < fabsf(v2[1])))
|
| 148 |
-
{
|
| 149 |
-
F32 temp[18];
|
| 150 |
-
num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[1], d1[3] + d1[1], d2[3] + d2[1]);
|
| 151 |
-
num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[1], d1[3] - d1[1], d2[3] - d2[1]);
|
| 152 |
-
}
|
| 153 |
-
if ((v0[3] < fabsf(v0[2])) | (v1[3] < fabsf(v1[2])) | (v2[3] < fabsf(v2[2])))
|
| 154 |
-
{
|
| 155 |
-
F32 temp[18];
|
| 156 |
-
num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[2], d1[3] + d1[2], d2[3] + d2[2]);
|
| 157 |
-
num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[2], d1[3] - d1[2], d2[3] - d2[2]);
|
| 158 |
-
}
|
| 159 |
-
return num;
|
| 160 |
-
}
|
| 161 |
-
|
| 162 |
-
//------------------------------------------------------------------------
|
| 163 |
-
|
| 164 |
-
__device__ __inline__ U32 idiv_fast(U32 a, U32 b)
|
| 165 |
-
{
|
| 166 |
-
return f32_to_u32_sat_rmi(((F32)a + 0.5f) / (F32)b);
|
| 167 |
-
}
|
| 168 |
-
|
| 169 |
-
//------------------------------------------------------------------------
|
| 170 |
-
|
| 171 |
-
__device__ __inline__ U32 toABGR(float4 color)
|
| 172 |
-
{
|
| 173 |
-
// 11 instructions: 4*FFMA, 4*F2I, 3*PRMT
|
| 174 |
-
U32 x = f32_to_u32_sat_rmi(fma_rm(color.x, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
|
| 175 |
-
U32 y = f32_to_u32_sat_rmi(fma_rm(color.y, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
|
| 176 |
-
U32 z = f32_to_u32_sat_rmi(fma_rm(color.z, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
|
| 177 |
-
U32 w = f32_to_u32_sat_rmi(fma_rm(color.w, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
|
| 178 |
-
return prmt(prmt(x, y, 0x0073), prmt(z, w, 0x0073), 0x5410);
|
| 179 |
-
}
|
| 180 |
-
|
| 181 |
-
//------------------------------------------------------------------------
|
| 182 |
-
// v0 = subpixels relative to the bottom-left sampling point
|
| 183 |
-
|
| 184 |
-
__device__ __inline__ uint3 setupPleq(float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp)
|
| 185 |
-
{
|
| 186 |
-
F32 mx = fmaxf(fmaxf(values.x, values.y), values.z);
|
| 187 |
-
int sh = ::min(::max((__float_as_int(mx) >> 23) - (127 + 22), 0), 8);
|
| 188 |
-
S32 t0 = (U32)values.x >> sh;
|
| 189 |
-
S32 t1 = ((U32)values.y >> sh) - t0;
|
| 190 |
-
S32 t2 = ((U32)values.z >> sh) - t0;
|
| 191 |
-
|
| 192 |
-
U32 rcpMant = (__float_as_int(areaRcp) & 0x007FFFFF) | 0x00800000;
|
| 193 |
-
int rcpShift = (23 + 127) - (__float_as_int(areaRcp) >> 23);
|
| 194 |
-
|
| 195 |
-
uint3 pleq;
|
| 196 |
-
S64 xc = ((S64)t1 * d2.y - (S64)t2 * d1.y) * rcpMant;
|
| 197 |
-
S64 yc = ((S64)t2 * d1.x - (S64)t1 * d2.x) * rcpMant;
|
| 198 |
-
pleq.x = (U32)(xc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2)));
|
| 199 |
-
pleq.y = (U32)(yc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2)));
|
| 200 |
-
|
| 201 |
-
S32 centerX = (v0.x * 2 + min_min(d1.x, d2.x, 0) + max_max(d1.x, d2.x, 0)) >> (CR_SUBPIXEL_LOG2 + 1);
|
| 202 |
-
S32 centerY = (v0.y * 2 + min_min(d1.y, d2.y, 0) + max_max(d1.y, d2.y, 0)) >> (CR_SUBPIXEL_LOG2 + 1);
|
| 203 |
-
S32 vcx = v0.x - (centerX << CR_SUBPIXEL_LOG2);
|
| 204 |
-
S32 vcy = v0.y - (centerY << CR_SUBPIXEL_LOG2);
|
| 205 |
-
|
| 206 |
-
pleq.z = t0 << sh;
|
| 207 |
-
pleq.z -= (U32)(((xc >> 13) * vcx + (yc >> 13) * vcy) >> (rcpShift - (sh + 13)));
|
| 208 |
-
pleq.z -= pleq.x * centerX + pleq.y * centerY;
|
| 209 |
-
return pleq;
|
| 210 |
-
}
|
| 211 |
-
|
| 212 |
-
//------------------------------------------------------------------------
|
| 213 |
-
|
| 214 |
-
__device__ __inline__ void cover8x8_setupLUT(volatile U64* lut)
|
| 215 |
-
{
|
| 216 |
-
for (S32 lutIdx = threadIdx.x + blockDim.x * threadIdx.y; lutIdx < CR_COVER8X8_LUT_SIZE; lutIdx += blockDim.x * blockDim.y)
|
| 217 |
-
{
|
| 218 |
-
int half = (lutIdx < (12 << 5)) ? 0 : 1;
|
| 219 |
-
int yint = (lutIdx >> 5) - half * 12 - 3;
|
| 220 |
-
U32 shape = ((lutIdx >> 2) & 7) << (31 - 2);
|
| 221 |
-
S32 slctSwapXY = lutIdx << (31 - 1);
|
| 222 |
-
S32 slctNegX = lutIdx << (31 - 0);
|
| 223 |
-
S32 slctCompl = slctSwapXY ^ slctNegX;
|
| 224 |
-
|
| 225 |
-
U64 mask = 0;
|
| 226 |
-
int xlo = half * 4;
|
| 227 |
-
int xhi = xlo + 4;
|
| 228 |
-
for (int x = xlo; x < xhi; x++)
|
| 229 |
-
{
|
| 230 |
-
int ylo = slct(0, ::max(yint, 0), slctCompl);
|
| 231 |
-
int yhi = slct(::min(yint, 8), 8, slctCompl);
|
| 232 |
-
for (int y = ylo; y < yhi; y++)
|
| 233 |
-
{
|
| 234 |
-
int xx = slct(x, y, slctSwapXY);
|
| 235 |
-
int yy = slct(y, x, slctSwapXY);
|
| 236 |
-
xx = slct(xx, 7 - xx, slctNegX);
|
| 237 |
-
mask |= (U64)1 << (xx + yy * 8);
|
| 238 |
-
}
|
| 239 |
-
yint += shape >> 31;
|
| 240 |
-
shape <<= 1;
|
| 241 |
-
}
|
| 242 |
-
lut[lutIdx] = mask;
|
| 243 |
-
}
|
| 244 |
-
}
|
| 245 |
-
|
| 246 |
-
//------------------------------------------------------------------------
|
| 247 |
-
|
| 248 |
-
__device__ __inline__ U64 cover8x8_exact_fast(S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut) // 52 instr
|
| 249 |
-
{
|
| 250 |
-
F32 yinitBias = (F32)(1 << (31 - CR_MAXVIEWPORT_LOG2 - CR_SUBPIXEL_LOG2 * 2));
|
| 251 |
-
F32 yinitScale = (F32)(1 << (32 - CR_SUBPIXEL_LOG2));
|
| 252 |
-
F32 yincScale = 65536.0f * 65536.0f;
|
| 253 |
-
|
| 254 |
-
S32 slctFlipY = flips << (31 - CR_FLIPBIT_FLIP_Y);
|
| 255 |
-
S32 slctFlipX = flips << (31 - CR_FLIPBIT_FLIP_X);
|
| 256 |
-
S32 slctSwapXY = flips << (31 - CR_FLIPBIT_SWAP_XY);
|
| 257 |
-
|
| 258 |
-
// Evaluate cross product.
|
| 259 |
-
|
| 260 |
-
S32 t = ox * dy - oy * dx;
|
| 261 |
-
F32 det = (F32)slct(t, t - dy * (7 << CR_SUBPIXEL_LOG2), slctFlipX);
|
| 262 |
-
if (flips >= (1 << CR_FLIPBIT_COMPL))
|
| 263 |
-
det = -det;
|
| 264 |
-
|
| 265 |
-
// Represent Y as a function of X.
|
| 266 |
-
|
| 267 |
-
F32 xrcp = 1.0f / (F32)::abs(slct(dx, dy, slctSwapXY));
|
| 268 |
-
F32 yzero = det * yinitScale * xrcp + yinitBias;
|
| 269 |
-
S64 yinit = f32_to_s64(slct(yzero, -yzero, slctFlipY));
|
| 270 |
-
U32 yinc = f32_to_u32_sat((F32)::abs(slct(dy, dx, slctSwapXY)) * xrcp * yincScale);
|
| 271 |
-
|
| 272 |
-
// Lookup.
|
| 273 |
-
|
| 274 |
-
return cover8x8_lookupMask(yinit, yinc, flips, lut);
|
| 275 |
-
}
|
| 276 |
-
|
| 277 |
-
//------------------------------------------------------------------------
|
| 278 |
-
|
| 279 |
-
__device__ __inline__ U64 cover8x8_lookupMask(S64 yinit, U32 yinc, U32 flips, volatile const U64* lut)
|
| 280 |
-
{
|
| 281 |
-
// First half.
|
| 282 |
-
|
| 283 |
-
U32 yfrac = getLo(yinit);
|
| 284 |
-
U32 shape = add_clamp_0_x(getHi(yinit) + 4, 0, 11);
|
| 285 |
-
add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
|
| 286 |
-
add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
|
| 287 |
-
add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
|
| 288 |
-
int oct = flips & ((1 << CR_FLIPBIT_FLIP_X) | (1 << CR_FLIPBIT_SWAP_XY));
|
| 289 |
-
U64 mask = *(U64*)((U8*)lut + oct + (shape << 5));
|
| 290 |
-
|
| 291 |
-
// Second half.
|
| 292 |
-
|
| 293 |
-
add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
|
| 294 |
-
shape = add_clamp_0_x(getHi(yinit) + 4, __popc(shape & 15), 11);
|
| 295 |
-
add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
|
| 296 |
-
add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
|
| 297 |
-
add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
|
| 298 |
-
mask |= *(U64*)((U8*)lut + oct + (shape << 5) + (12 << 8));
|
| 299 |
-
return (flips >= (1 << CR_FLIPBIT_COMPL)) ? ~mask : mask;
|
| 300 |
-
}
|
| 301 |
-
|
| 302 |
-
//------------------------------------------------------------------------
|
| 303 |
-
|
| 304 |
-
__device__ __inline__ U64 cover8x8_exact_noLUT(S32 ox, S32 oy, S32 dx, S32 dy)
|
| 305 |
-
{
|
| 306 |
-
S32 curr = ox * dy - oy * dx;
|
| 307 |
-
if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive
|
| 308 |
-
return cover8x8_generateMask_noLUT(curr, dx, dy);
|
| 309 |
-
}
|
| 310 |
-
|
| 311 |
-
//------------------------------------------------------------------------
|
| 312 |
-
|
| 313 |
-
__device__ __inline__ U64 cover8x8_conservative_noLUT(S32 ox, S32 oy, S32 dx, S32 dy)
|
| 314 |
-
{
|
| 315 |
-
S32 curr = ox * dy - oy * dx;
|
| 316 |
-
if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive
|
| 317 |
-
curr += (::abs(dx) + ::abs(dy)) << (CR_SUBPIXEL_LOG2 - 1);
|
| 318 |
-
return cover8x8_generateMask_noLUT(curr, dx, dy);
|
| 319 |
-
}
|
| 320 |
-
|
| 321 |
-
//------------------------------------------------------------------------
|
| 322 |
-
|
| 323 |
-
__device__ __inline__ U64 cover8x8_generateMask_noLUT(S32 curr, S32 dx, S32 dy)
|
| 324 |
-
{
|
| 325 |
-
curr += (dx - dy) * (7 << CR_SUBPIXEL_LOG2);
|
| 326 |
-
S32 stepX = dy << (CR_SUBPIXEL_LOG2 + 1);
|
| 327 |
-
S32 stepYorig = -dx - dy * 7;
|
| 328 |
-
S32 stepY = stepYorig << (CR_SUBPIXEL_LOG2 + 1);
|
| 329 |
-
|
| 330 |
-
U32 hi = isetge(curr, 0);
|
| 331 |
-
U32 frac = curr + curr;
|
| 332 |
-
for (int i = 62; i >= 32; i--)
|
| 333 |
-
add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, hi, hi, hi);
|
| 334 |
-
|
| 335 |
-
U32 lo = 0;
|
| 336 |
-
for (int i = 31; i >= 0; i--)
|
| 337 |
-
add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, lo, lo, lo);
|
| 338 |
-
|
| 339 |
-
lo ^= lo >> 1, hi ^= hi >> 1;
|
| 340 |
-
lo ^= lo >> 2, hi ^= hi >> 2;
|
| 341 |
-
lo ^= lo >> 4, hi ^= hi >> 4;
|
| 342 |
-
lo ^= lo >> 8, hi ^= hi >> 8;
|
| 343 |
-
lo ^= lo >> 16, hi ^= hi >> 16;
|
| 344 |
-
|
| 345 |
-
if (dy < 0)
|
| 346 |
-
{
|
| 347 |
-
lo ^= 0x55AA55AA;
|
| 348 |
-
hi ^= 0x55AA55AA;
|
| 349 |
-
}
|
| 350 |
-
if (stepYorig < 0)
|
| 351 |
-
{
|
| 352 |
-
lo ^= 0xFF00FF00;
|
| 353 |
-
hi ^= 0x00FF00FF;
|
| 354 |
-
}
|
| 355 |
-
if ((hi & 1) != 0)
|
| 356 |
-
lo = ~lo;
|
| 357 |
-
|
| 358 |
-
return combineLoHi(lo, hi);
|
| 359 |
-
}
|
| 360 |
-
|
| 361 |
-
//------------------------------------------------------------------------
|
| 362 |
-
|
| 363 |
-
template <class T> __device__ __inline__ void sortShared(T* ptr, int numItems)
|
| 364 |
-
{
|
| 365 |
-
int thrInBlock = threadIdx.x + threadIdx.y * blockDim.x;
|
| 366 |
-
int range = 16;
|
| 367 |
-
|
| 368 |
-
// Use transposition sort within each 16-wide subrange.
|
| 369 |
-
|
| 370 |
-
int base = thrInBlock * 2;
|
| 371 |
-
bool act = (base < numItems - 1);
|
| 372 |
-
U32 actMask = __ballot_sync(~0u, act);
|
| 373 |
-
if (act)
|
| 374 |
-
{
|
| 375 |
-
bool tryOdd = (base < numItems - 2 && (~base & (range - 2)) != 0);
|
| 376 |
-
T mid = ptr[base + 1];
|
| 377 |
-
|
| 378 |
-
for (int iter = 0; iter < range; iter += 2)
|
| 379 |
-
{
|
| 380 |
-
// Evens.
|
| 381 |
-
|
| 382 |
-
T tmp = ptr[base + 0];
|
| 383 |
-
if (tmp > mid)
|
| 384 |
-
{
|
| 385 |
-
ptr[base + 0] = mid;
|
| 386 |
-
mid = tmp;
|
| 387 |
-
}
|
| 388 |
-
__syncwarp(actMask);
|
| 389 |
-
|
| 390 |
-
// Odds.
|
| 391 |
-
|
| 392 |
-
if (tryOdd)
|
| 393 |
-
{
|
| 394 |
-
tmp = ptr[base + 2];
|
| 395 |
-
if (mid > tmp)
|
| 396 |
-
{
|
| 397 |
-
ptr[base + 2] = mid;
|
| 398 |
-
mid = tmp;
|
| 399 |
-
}
|
| 400 |
-
}
|
| 401 |
-
__syncwarp(actMask);
|
| 402 |
-
}
|
| 403 |
-
ptr[base + 1] = mid;
|
| 404 |
-
}
|
| 405 |
-
|
| 406 |
-
// Multiple subranges => Merge hierarchically.
|
| 407 |
-
|
| 408 |
-
for (; range < numItems; range <<= 1)
|
| 409 |
-
{
|
| 410 |
-
// Assuming that we would insert the current item into the other
|
| 411 |
-
// subrange, use binary search to find the appropriate slot.
|
| 412 |
-
|
| 413 |
-
__syncthreads();
|
| 414 |
-
|
| 415 |
-
T item;
|
| 416 |
-
int slot;
|
| 417 |
-
if (thrInBlock < numItems)
|
| 418 |
-
{
|
| 419 |
-
item = ptr[thrInBlock];
|
| 420 |
-
slot = (thrInBlock & -range) ^ range;
|
| 421 |
-
if (slot < numItems)
|
| 422 |
-
{
|
| 423 |
-
T tmp = ptr[slot];
|
| 424 |
-
bool inclusive = ((thrInBlock & range) != 0);
|
| 425 |
-
if (tmp < item || (inclusive && tmp == item))
|
| 426 |
-
{
|
| 427 |
-
for (int step = (range >> 1); step != 0; step >>= 1)
|
| 428 |
-
{
|
| 429 |
-
int probe = slot + step;
|
| 430 |
-
if (probe < numItems)
|
| 431 |
-
{
|
| 432 |
-
tmp = ptr[probe];
|
| 433 |
-
if (tmp < item || (inclusive && tmp == item))
|
| 434 |
-
slot = probe;
|
| 435 |
-
}
|
| 436 |
-
}
|
| 437 |
-
slot++;
|
| 438 |
-
}
|
| 439 |
-
}
|
| 440 |
-
}
|
| 441 |
-
|
| 442 |
-
// Store the item at an appropriate place.
|
| 443 |
-
|
| 444 |
-
__syncthreads();
|
| 445 |
-
|
| 446 |
-
if (thrInBlock < numItems)
|
| 447 |
-
ptr[slot + (thrInBlock & (range * 2 - 1)) - range] = item;
|
| 448 |
-
}
|
| 449 |
-
}
|
| 450 |
-
|
| 451 |
-
//------------------------------------------------------------------------
|
| 452 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/framework.h
DELETED
|
@@ -1,49 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
|
| 11 |
-
// Framework-specific macros to enable code sharing.
|
| 12 |
-
|
| 13 |
-
//------------------------------------------------------------------------
|
| 14 |
-
// Tensorflow.
|
| 15 |
-
|
| 16 |
-
#ifdef NVDR_TENSORFLOW
|
| 17 |
-
#define EIGEN_USE_GPU
|
| 18 |
-
#include "tensorflow/core/framework/op.h"
|
| 19 |
-
#include "tensorflow/core/framework/op_kernel.h"
|
| 20 |
-
#include "tensorflow/core/framework/shape_inference.h"
|
| 21 |
-
#include "tensorflow/core/platform/default/logging.h"
|
| 22 |
-
using namespace tensorflow;
|
| 23 |
-
using namespace tensorflow::shape_inference;
|
| 24 |
-
#define NVDR_CTX_ARGS OpKernelContext* _nvdr_ctx
|
| 25 |
-
#define NVDR_CTX_PARAMS _nvdr_ctx
|
| 26 |
-
#define NVDR_CHECK(COND, ERR) OP_REQUIRES(_nvdr_ctx, COND, errors::Internal(ERR))
|
| 27 |
-
#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) OP_CHECK_CUDA_ERROR(_nvdr_ctx, CUDA_CALL)
|
| 28 |
-
#define NVDR_CHECK_GL_ERROR(GL_CALL) OP_CHECK_GL_ERROR(_nvdr_ctx, GL_CALL)
|
| 29 |
-
#endif
|
| 30 |
-
|
| 31 |
-
//------------------------------------------------------------------------
|
| 32 |
-
// PyTorch.
|
| 33 |
-
|
| 34 |
-
#ifdef NVDR_TORCH
|
| 35 |
-
#ifndef __CUDACC__
|
| 36 |
-
#include <torch/extension.h>
|
| 37 |
-
#include <ATen/cuda/CUDAContext.h>
|
| 38 |
-
#include <ATen/cuda/CUDAUtils.h>
|
| 39 |
-
#include <c10/cuda/CUDAGuard.h>
|
| 40 |
-
#include <pybind11/numpy.h>
|
| 41 |
-
#endif
|
| 42 |
-
#define NVDR_CTX_ARGS int _nvdr_ctx_dummy
|
| 43 |
-
#define NVDR_CTX_PARAMS 0
|
| 44 |
-
#define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0)
|
| 45 |
-
#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(!err, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0)
|
| 46 |
-
#define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0)
|
| 47 |
-
#endif
|
| 48 |
-
|
| 49 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/glutil.cpp
DELETED
|
@@ -1,403 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
//------------------------------------------------------------------------
|
| 10 |
-
// Common.
|
| 11 |
-
//------------------------------------------------------------------------
|
| 12 |
-
|
| 13 |
-
#include "framework.h"
|
| 14 |
-
#include "glutil.h"
|
| 15 |
-
#include <iostream>
|
| 16 |
-
#include <iomanip>
|
| 17 |
-
|
| 18 |
-
// Create the function pointers.
|
| 19 |
-
#define GLUTIL_EXT(return_type, name, ...) return_type (GLAPIENTRY* name)(__VA_ARGS__) = 0;
|
| 20 |
-
#include "glutil_extlist.h"
|
| 21 |
-
#undef GLUTIL_EXT
|
| 22 |
-
|
| 23 |
-
// Track initialization status.
|
| 24 |
-
static volatile bool s_glExtInitialized = false;
|
| 25 |
-
|
| 26 |
-
// Error strings.
|
| 27 |
-
const char* getGLErrorString(GLenum err)
|
| 28 |
-
{
|
| 29 |
-
switch(err)
|
| 30 |
-
{
|
| 31 |
-
case GL_NO_ERROR: return "GL_NO_ERROR";
|
| 32 |
-
case GL_INVALID_ENUM: return "GL_INVALID_ENUM";
|
| 33 |
-
case GL_INVALID_VALUE: return "GL_INVALID_VALUE";
|
| 34 |
-
case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION";
|
| 35 |
-
case GL_STACK_OVERFLOW: return "GL_STACK_OVERFLOW";
|
| 36 |
-
case GL_STACK_UNDERFLOW: return "GL_STACK_UNDERFLOW";
|
| 37 |
-
case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY";
|
| 38 |
-
case GL_INVALID_FRAMEBUFFER_OPERATION: return "GL_INVALID_FRAMEBUFFER_OPERATION";
|
| 39 |
-
case GL_TABLE_TOO_LARGE: return "GL_TABLE_TOO_LARGE";
|
| 40 |
-
case GL_CONTEXT_LOST: return "GL_CONTEXT_LOST";
|
| 41 |
-
}
|
| 42 |
-
return "Unknown error";
|
| 43 |
-
}
|
| 44 |
-
|
| 45 |
-
//------------------------------------------------------------------------
|
| 46 |
-
// Windows.
|
| 47 |
-
//------------------------------------------------------------------------
|
| 48 |
-
|
| 49 |
-
#ifdef _WIN32
|
| 50 |
-
|
| 51 |
-
static CRITICAL_SECTION getInitializedCriticalSection(void)
|
| 52 |
-
{
|
| 53 |
-
CRITICAL_SECTION cs;
|
| 54 |
-
InitializeCriticalSection(&cs);
|
| 55 |
-
return cs;
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
static CRITICAL_SECTION s_getProcAddressMutex = getInitializedCriticalSection();
|
| 59 |
-
|
| 60 |
-
static void safeGetProcAddress(const char* name, PROC* pfn)
|
| 61 |
-
{
|
| 62 |
-
PROC result = wglGetProcAddress(name);
|
| 63 |
-
if (!result)
|
| 64 |
-
{
|
| 65 |
-
LeaveCriticalSection(&s_getProcAddressMutex); // Prepare for thread exit.
|
| 66 |
-
LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'";
|
| 67 |
-
exit(1); // Should never get here but make sure we exit.
|
| 68 |
-
}
|
| 69 |
-
*pfn = result;
|
| 70 |
-
}
|
| 71 |
-
|
| 72 |
-
static void initializeGLExtensions(void)
|
| 73 |
-
{
|
| 74 |
-
// Use critical section for thread safety.
|
| 75 |
-
EnterCriticalSection(&s_getProcAddressMutex);
|
| 76 |
-
|
| 77 |
-
// Only dig function pointers if not done already.
|
| 78 |
-
if (!s_glExtInitialized)
|
| 79 |
-
{
|
| 80 |
-
// Generate code to populate the function pointers.
|
| 81 |
-
#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROC*)&name);
|
| 82 |
-
#include "glutil_extlist.h"
|
| 83 |
-
#undef GLUTIL_EXT
|
| 84 |
-
|
| 85 |
-
// Mark as initialized.
|
| 86 |
-
s_glExtInitialized = true;
|
| 87 |
-
}
|
| 88 |
-
|
| 89 |
-
// Done.
|
| 90 |
-
LeaveCriticalSection(&s_getProcAddressMutex);
|
| 91 |
-
return;
|
| 92 |
-
}
|
| 93 |
-
|
| 94 |
-
void setGLContext(GLContext& glctx)
|
| 95 |
-
{
|
| 96 |
-
if (!glctx.hglrc)
|
| 97 |
-
LOG(FATAL) << "setGLContext() called with null gltcx";
|
| 98 |
-
if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
|
| 99 |
-
LOG(FATAL) << "wglMakeCurrent() failed when setting GL context";
|
| 100 |
-
|
| 101 |
-
if (glctx.extInitialized)
|
| 102 |
-
return;
|
| 103 |
-
initializeGLExtensions();
|
| 104 |
-
glctx.extInitialized = 1;
|
| 105 |
-
}
|
| 106 |
-
|
| 107 |
-
void releaseGLContext(void)
|
| 108 |
-
{
|
| 109 |
-
if (!wglMakeCurrent(NULL, NULL))
|
| 110 |
-
LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context";
|
| 111 |
-
}
|
| 112 |
-
|
| 113 |
-
extern "C" int set_gpu(const char*); // In setgpu.lib
|
| 114 |
-
GLContext createGLContext(int cudaDeviceIdx)
|
| 115 |
-
{
|
| 116 |
-
if (cudaDeviceIdx >= 0)
|
| 117 |
-
{
|
| 118 |
-
char pciBusId[256] = "";
|
| 119 |
-
LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
|
| 120 |
-
if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx))
|
| 121 |
-
{
|
| 122 |
-
LOG(INFO) << "PCI bus id query failed";
|
| 123 |
-
}
|
| 124 |
-
else
|
| 125 |
-
{
|
| 126 |
-
int res = set_gpu(pciBusId);
|
| 127 |
-
LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? "failed, expect crash or major slowdown" : "success");
|
| 128 |
-
}
|
| 129 |
-
}
|
| 130 |
-
|
| 131 |
-
HINSTANCE hInstance = GetModuleHandle(NULL);
|
| 132 |
-
WNDCLASS wc = {};
|
| 133 |
-
wc.style = CS_OWNDC;
|
| 134 |
-
wc.lpfnWndProc = DefWindowProc;
|
| 135 |
-
wc.hInstance = hInstance;
|
| 136 |
-
wc.lpszClassName = "__DummyGLClassCPP";
|
| 137 |
-
int res = RegisterClass(&wc);
|
| 138 |
-
|
| 139 |
-
HWND hwnd = CreateWindow(
|
| 140 |
-
"__DummyGLClassCPP", // lpClassName
|
| 141 |
-
"__DummyGLWindowCPP", // lpWindowName
|
| 142 |
-
WS_OVERLAPPEDWINDOW, // dwStyle
|
| 143 |
-
CW_USEDEFAULT, // x
|
| 144 |
-
CW_USEDEFAULT, // y
|
| 145 |
-
0, 0, // nWidth, nHeight
|
| 146 |
-
NULL, NULL, // hWndParent, hMenu
|
| 147 |
-
hInstance, // hInstance
|
| 148 |
-
NULL // lpParam
|
| 149 |
-
);
|
| 150 |
-
|
| 151 |
-
PIXELFORMATDESCRIPTOR pfd = {};
|
| 152 |
-
pfd.dwFlags = PFD_SUPPORT_OPENGL;
|
| 153 |
-
pfd.iPixelType = PFD_TYPE_RGBA;
|
| 154 |
-
pfd.iLayerType = PFD_MAIN_PLANE;
|
| 155 |
-
pfd.cColorBits = 32;
|
| 156 |
-
pfd.cDepthBits = 24;
|
| 157 |
-
pfd.cStencilBits = 8;
|
| 158 |
-
|
| 159 |
-
HDC hdc = GetDC(hwnd);
|
| 160 |
-
int pixelformat = ChoosePixelFormat(hdc, &pfd);
|
| 161 |
-
SetPixelFormat(hdc, pixelformat, &pfd);
|
| 162 |
-
|
| 163 |
-
HGLRC hglrc = wglCreateContext(hdc);
|
| 164 |
-
LOG(INFO) << std::hex << std::setfill('0')
|
| 165 |
-
<< "WGL OpenGL context created (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hdc
|
| 166 |
-
<< ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hglrc << ")";
|
| 167 |
-
|
| 168 |
-
GLContext glctx = {hdc, hglrc, 0};
|
| 169 |
-
return glctx;
|
| 170 |
-
}
|
| 171 |
-
|
| 172 |
-
void destroyGLContext(GLContext& glctx)
|
| 173 |
-
{
|
| 174 |
-
if (!glctx.hglrc)
|
| 175 |
-
LOG(FATAL) << "destroyGLContext() called with null gltcx";
|
| 176 |
-
|
| 177 |
-
// If this is the current context, release it.
|
| 178 |
-
if (wglGetCurrentContext() == glctx.hglrc)
|
| 179 |
-
releaseGLContext();
|
| 180 |
-
|
| 181 |
-
HWND hwnd = WindowFromDC(glctx.hdc);
|
| 182 |
-
if (!hwnd)
|
| 183 |
-
LOG(FATAL) << "WindowFromDC() failed";
|
| 184 |
-
if (!ReleaseDC(hwnd, glctx.hdc))
|
| 185 |
-
LOG(FATAL) << "ReleaseDC() failed";
|
| 186 |
-
if (!wglDeleteContext(glctx.hglrc))
|
| 187 |
-
LOG(FATAL) << "wglDeleteContext() failed";
|
| 188 |
-
if (!DestroyWindow(hwnd))
|
| 189 |
-
LOG(FATAL) << "DestroyWindow() failed";
|
| 190 |
-
|
| 191 |
-
LOG(INFO) << std::hex << std::setfill('0')
|
| 192 |
-
<< "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
|
| 193 |
-
<< ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hglrc << ")";
|
| 194 |
-
|
| 195 |
-
memset(&glctx, 0, sizeof(GLContext));
|
| 196 |
-
}
|
| 197 |
-
|
| 198 |
-
#endif // _WIN32
|
| 199 |
-
|
| 200 |
-
//------------------------------------------------------------------------
|
| 201 |
-
// Linux.
|
| 202 |
-
//------------------------------------------------------------------------
|
| 203 |
-
|
| 204 |
-
#ifdef __linux__
|
| 205 |
-
|
| 206 |
-
static pthread_mutex_t s_getProcAddressMutex;
|
| 207 |
-
|
| 208 |
-
typedef void (*PROCFN)();
|
| 209 |
-
|
| 210 |
-
static void safeGetProcAddress(const char* name, PROCFN* pfn)
|
| 211 |
-
{
|
| 212 |
-
PROCFN result = eglGetProcAddress(name);
|
| 213 |
-
if (!result)
|
| 214 |
-
{
|
| 215 |
-
pthread_mutex_unlock(&s_getProcAddressMutex); // Prepare for thread exit.
|
| 216 |
-
LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'";
|
| 217 |
-
exit(1); // Should never get here but make sure we exit.
|
| 218 |
-
}
|
| 219 |
-
*pfn = result;
|
| 220 |
-
}
|
| 221 |
-
|
| 222 |
-
static void initializeGLExtensions(void)
|
| 223 |
-
{
|
| 224 |
-
pthread_mutex_lock(&s_getProcAddressMutex);
|
| 225 |
-
|
| 226 |
-
// Only dig function pointers if not done already.
|
| 227 |
-
if (!s_glExtInitialized)
|
| 228 |
-
{
|
| 229 |
-
// Generate code to populate the function pointers.
|
| 230 |
-
#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROCFN*)&name);
|
| 231 |
-
#include "glutil_extlist.h"
|
| 232 |
-
#undef GLUTIL_EXT
|
| 233 |
-
|
| 234 |
-
// Mark as initialized.
|
| 235 |
-
s_glExtInitialized = true;
|
| 236 |
-
}
|
| 237 |
-
|
| 238 |
-
pthread_mutex_unlock(&s_getProcAddressMutex);
|
| 239 |
-
return;
|
| 240 |
-
}
|
| 241 |
-
|
| 242 |
-
void setGLContext(GLContext& glctx)
|
| 243 |
-
{
|
| 244 |
-
if (!glctx.context)
|
| 245 |
-
LOG(FATAL) << "setGLContext() called with null gltcx";
|
| 246 |
-
|
| 247 |
-
if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context))
|
| 248 |
-
LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
|
| 249 |
-
|
| 250 |
-
if (glctx.extInitialized)
|
| 251 |
-
return;
|
| 252 |
-
initializeGLExtensions();
|
| 253 |
-
glctx.extInitialized = 1;
|
| 254 |
-
}
|
| 255 |
-
|
| 256 |
-
void releaseGLContext(void)
|
| 257 |
-
{
|
| 258 |
-
EGLDisplay display = eglGetCurrentDisplay();
|
| 259 |
-
if (display == EGL_NO_DISPLAY)
|
| 260 |
-
LOG(WARNING) << "releaseGLContext() called with no active display";
|
| 261 |
-
if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
|
| 262 |
-
LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context";
|
| 263 |
-
}
|
| 264 |
-
|
| 265 |
-
static EGLDisplay getCudaDisplay(int cudaDeviceIdx)
|
| 266 |
-
{
|
| 267 |
-
typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*);
|
| 268 |
-
typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*);
|
| 269 |
-
typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*);
|
| 270 |
-
|
| 271 |
-
eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT");
|
| 272 |
-
if (!eglQueryDevicesEXT)
|
| 273 |
-
{
|
| 274 |
-
LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed";
|
| 275 |
-
return 0;
|
| 276 |
-
}
|
| 277 |
-
|
| 278 |
-
eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT");
|
| 279 |
-
if (!eglQueryDeviceAttribEXT)
|
| 280 |
-
{
|
| 281 |
-
LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed";
|
| 282 |
-
return 0;
|
| 283 |
-
}
|
| 284 |
-
|
| 285 |
-
eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT");
|
| 286 |
-
if (!eglGetPlatformDisplayEXT)
|
| 287 |
-
{
|
| 288 |
-
LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed";
|
| 289 |
-
return 0;
|
| 290 |
-
}
|
| 291 |
-
|
| 292 |
-
int num_devices = 0;
|
| 293 |
-
eglQueryDevicesEXT(0, 0, &num_devices);
|
| 294 |
-
if (!num_devices)
|
| 295 |
-
return 0;
|
| 296 |
-
|
| 297 |
-
EGLDisplay display = 0;
|
| 298 |
-
EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*));
|
| 299 |
-
eglQueryDevicesEXT(num_devices, devices, &num_devices);
|
| 300 |
-
for (int i=0; i < num_devices; i++)
|
| 301 |
-
{
|
| 302 |
-
EGLDeviceEXT device = devices[i];
|
| 303 |
-
intptr_t value = -1;
|
| 304 |
-
if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx)
|
| 305 |
-
{
|
| 306 |
-
display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0);
|
| 307 |
-
break;
|
| 308 |
-
}
|
| 309 |
-
}
|
| 310 |
-
|
| 311 |
-
free(devices);
|
| 312 |
-
return display;
|
| 313 |
-
}
|
| 314 |
-
|
| 315 |
-
GLContext createGLContext(int cudaDeviceIdx)
|
| 316 |
-
{
|
| 317 |
-
EGLDisplay display = 0;
|
| 318 |
-
|
| 319 |
-
if (cudaDeviceIdx >= 0)
|
| 320 |
-
{
|
| 321 |
-
char pciBusId[256] = "";
|
| 322 |
-
LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
|
| 323 |
-
display = getCudaDisplay(cudaDeviceIdx);
|
| 324 |
-
if (!display)
|
| 325 |
-
LOG(INFO) << "Failed, falling back to default display";
|
| 326 |
-
}
|
| 327 |
-
|
| 328 |
-
if (!display)
|
| 329 |
-
{
|
| 330 |
-
display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
|
| 331 |
-
if (display == EGL_NO_DISPLAY)
|
| 332 |
-
LOG(FATAL) << "eglGetDisplay() failed";
|
| 333 |
-
}
|
| 334 |
-
|
| 335 |
-
EGLint major;
|
| 336 |
-
EGLint minor;
|
| 337 |
-
if (!eglInitialize(display, &major, &minor))
|
| 338 |
-
LOG(FATAL) << "eglInitialize() failed";
|
| 339 |
-
|
| 340 |
-
// Choose configuration.
|
| 341 |
-
|
| 342 |
-
const EGLint context_attribs[] = {
|
| 343 |
-
EGL_RED_SIZE, 8,
|
| 344 |
-
EGL_GREEN_SIZE, 8,
|
| 345 |
-
EGL_BLUE_SIZE, 8,
|
| 346 |
-
EGL_ALPHA_SIZE, 8,
|
| 347 |
-
EGL_DEPTH_SIZE, 24,
|
| 348 |
-
EGL_STENCIL_SIZE, 8,
|
| 349 |
-
EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT,
|
| 350 |
-
EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
|
| 351 |
-
EGL_NONE
|
| 352 |
-
};
|
| 353 |
-
|
| 354 |
-
EGLConfig config;
|
| 355 |
-
EGLint num_config;
|
| 356 |
-
if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
|
| 357 |
-
LOG(FATAL) << "eglChooseConfig() failed";
|
| 358 |
-
|
| 359 |
-
// Create GL context.
|
| 360 |
-
|
| 361 |
-
if (!eglBindAPI(EGL_OPENGL_API))
|
| 362 |
-
LOG(FATAL) << "eglBindAPI() failed";
|
| 363 |
-
|
| 364 |
-
EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
|
| 365 |
-
if (context == EGL_NO_CONTEXT)
|
| 366 |
-
LOG(FATAL) << "eglCreateContext() failed";
|
| 367 |
-
|
| 368 |
-
// Done.
|
| 369 |
-
|
| 370 |
-
LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x"
|
| 371 |
-
<< std::hex << std::setfill('0')
|
| 372 |
-
<< std::setw(16) << (uintptr_t)display
|
| 373 |
-
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";
|
| 374 |
-
|
| 375 |
-
GLContext glctx = {display, context, 0};
|
| 376 |
-
return glctx;
|
| 377 |
-
}
|
| 378 |
-
|
| 379 |
-
void destroyGLContext(GLContext& glctx)
|
| 380 |
-
{
|
| 381 |
-
if (!glctx.context)
|
| 382 |
-
LOG(FATAL) << "destroyGLContext() called with null gltcx";
|
| 383 |
-
|
| 384 |
-
// If this is the current context, release it.
|
| 385 |
-
if (eglGetCurrentContext() == glctx.context)
|
| 386 |
-
releaseGLContext();
|
| 387 |
-
|
| 388 |
-
if (!eglDestroyContext(glctx.display, glctx.context))
|
| 389 |
-
LOG(ERROR) << "eglDestroyContext() failed";
|
| 390 |
-
|
| 391 |
-
LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
|
| 392 |
-
<< std::hex << std::setfill('0')
|
| 393 |
-
<< std::setw(16) << (uintptr_t)glctx.display
|
| 394 |
-
<< ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";
|
| 395 |
-
|
| 396 |
-
memset(&glctx, 0, sizeof(GLContext));
|
| 397 |
-
}
|
| 398 |
-
|
| 399 |
-
//------------------------------------------------------------------------
|
| 400 |
-
|
| 401 |
-
#endif // __linux__
|
| 402 |
-
|
| 403 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/glutil.h
DELETED
|
@@ -1,113 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#pragma once
|
| 10 |
-
|
| 11 |
-
//------------------------------------------------------------------------
|
| 12 |
-
// Windows-specific headers and types.
|
| 13 |
-
//------------------------------------------------------------------------
|
| 14 |
-
|
| 15 |
-
#ifdef _WIN32
|
| 16 |
-
#define NOMINMAX
|
| 17 |
-
#include <windows.h> // Required by gl.h in Windows.
|
| 18 |
-
#define GLAPIENTRY APIENTRY
|
| 19 |
-
|
| 20 |
-
struct GLContext
|
| 21 |
-
{
|
| 22 |
-
HDC hdc;
|
| 23 |
-
HGLRC hglrc;
|
| 24 |
-
int extInitialized;
|
| 25 |
-
};
|
| 26 |
-
|
| 27 |
-
#endif // _WIN32
|
| 28 |
-
|
| 29 |
-
//------------------------------------------------------------------------
|
| 30 |
-
// Linux-specific headers and types.
|
| 31 |
-
//------------------------------------------------------------------------
|
| 32 |
-
|
| 33 |
-
#ifdef __linux__
|
| 34 |
-
#define EGL_NO_X11 // X11/Xlib.h has "#define Status int" which breaks Tensorflow. Avoid it.
|
| 35 |
-
#define MESA_EGL_NO_X11_HEADERS
|
| 36 |
-
#include <EGL/egl.h>
|
| 37 |
-
#include <EGL/eglext.h>
|
| 38 |
-
#define GLAPIENTRY
|
| 39 |
-
|
| 40 |
-
struct GLContext
|
| 41 |
-
{
|
| 42 |
-
EGLDisplay display;
|
| 43 |
-
EGLContext context;
|
| 44 |
-
int extInitialized;
|
| 45 |
-
};
|
| 46 |
-
|
| 47 |
-
#endif // __linux__
|
| 48 |
-
|
| 49 |
-
//------------------------------------------------------------------------
|
| 50 |
-
// OpenGL, CUDA interop, GL extensions.
|
| 51 |
-
//------------------------------------------------------------------------
|
| 52 |
-
#define GL_GLEXT_LEGACY
|
| 53 |
-
#include <GL/gl.h>
|
| 54 |
-
#include <cuda_gl_interop.h>
|
| 55 |
-
|
| 56 |
-
// Constants.
|
| 57 |
-
#ifndef GL_VERSION_1_2
|
| 58 |
-
#define GL_CLAMP_TO_EDGE 0x812F
|
| 59 |
-
#define GL_TEXTURE_3D 0x806F
|
| 60 |
-
#endif
|
| 61 |
-
#ifndef GL_VERSION_1_5
|
| 62 |
-
#define GL_ARRAY_BUFFER 0x8892
|
| 63 |
-
#define GL_DYNAMIC_DRAW 0x88E8
|
| 64 |
-
#define GL_ELEMENT_ARRAY_BUFFER 0x8893
|
| 65 |
-
#endif
|
| 66 |
-
#ifndef GL_VERSION_2_0
|
| 67 |
-
#define GL_FRAGMENT_SHADER 0x8B30
|
| 68 |
-
#define GL_INFO_LOG_LENGTH 0x8B84
|
| 69 |
-
#define GL_LINK_STATUS 0x8B82
|
| 70 |
-
#define GL_VERTEX_SHADER 0x8B31
|
| 71 |
-
#endif
|
| 72 |
-
#ifndef GL_VERSION_3_0
|
| 73 |
-
#define GL_MAJOR_VERSION 0x821B
|
| 74 |
-
#define GL_MINOR_VERSION 0x821C
|
| 75 |
-
#define GL_RGBA32F 0x8814
|
| 76 |
-
#define GL_TEXTURE_2D_ARRAY 0x8C1A
|
| 77 |
-
#endif
|
| 78 |
-
#ifndef GL_VERSION_3_2
|
| 79 |
-
#define GL_GEOMETRY_SHADER 0x8DD9
|
| 80 |
-
#endif
|
| 81 |
-
#ifndef GL_ARB_framebuffer_object
|
| 82 |
-
#define GL_COLOR_ATTACHMENT0 0x8CE0
|
| 83 |
-
#define GL_COLOR_ATTACHMENT1 0x8CE1
|
| 84 |
-
#define GL_DEPTH_STENCIL 0x84F9
|
| 85 |
-
#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A
|
| 86 |
-
#define GL_DEPTH24_STENCIL8 0x88F0
|
| 87 |
-
#define GL_FRAMEBUFFER 0x8D40
|
| 88 |
-
#define GL_INVALID_FRAMEBUFFER_OPERATION 0x0506
|
| 89 |
-
#define GL_UNSIGNED_INT_24_8 0x84FA
|
| 90 |
-
#endif
|
| 91 |
-
#ifndef GL_ARB_imaging
|
| 92 |
-
#define GL_TABLE_TOO_LARGE 0x8031
|
| 93 |
-
#endif
|
| 94 |
-
#ifndef GL_KHR_robustness
|
| 95 |
-
#define GL_CONTEXT_LOST 0x0507
|
| 96 |
-
#endif
|
| 97 |
-
|
| 98 |
-
// Declare function pointers to OpenGL extension functions.
|
| 99 |
-
#define GLUTIL_EXT(return_type, name, ...) extern return_type (GLAPIENTRY* name)(__VA_ARGS__);
|
| 100 |
-
#include "glutil_extlist.h"
|
| 101 |
-
#undef GLUTIL_EXT
|
| 102 |
-
|
| 103 |
-
//------------------------------------------------------------------------
|
| 104 |
-
// Common functions.
|
| 105 |
-
//------------------------------------------------------------------------
|
| 106 |
-
|
| 107 |
-
void setGLContext (GLContext& glctx);
|
| 108 |
-
void releaseGLContext (void);
|
| 109 |
-
GLContext createGLContext (int cudaDeviceIdx);
|
| 110 |
-
void destroyGLContext (GLContext& glctx);
|
| 111 |
-
const char* getGLErrorString (GLenum err);
|
| 112 |
-
|
| 113 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extensions/nvdiffrast/nvdiffrast/common/glutil_extlist.h
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
-
//
|
| 3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
-
// and proprietary rights in and to this software, related documentation
|
| 5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
-
// distribution of this software and related documentation without an express
|
| 7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
-
|
| 9 |
-
#ifndef GL_VERSION_1_2
|
| 10 |
-
GLUTIL_EXT(void, glTexImage3D, GLenum target, GLint level, GLint internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
|
| 11 |
-
#endif
|
| 12 |
-
#ifndef GL_VERSION_1_5
|
| 13 |
-
GLUTIL_EXT(void, glBindBuffer, GLenum target, GLuint buffer);
|
| 14 |
-
GLUTIL_EXT(void, glBufferData, GLenum target, ptrdiff_t size, const void* data, GLenum usage);
|
| 15 |
-
GLUTIL_EXT(void, glGenBuffers, GLsizei n, GLuint* buffers);
|
| 16 |
-
#endif
|
| 17 |
-
#ifndef GL_VERSION_2_0
|
| 18 |
-
GLUTIL_EXT(void, glAttachShader, GLuint program, GLuint shader);
|
| 19 |
-
GLUTIL_EXT(void, glCompileShader, GLuint shader);
|
| 20 |
-
GLUTIL_EXT(GLuint, glCreateProgram, void);
|
| 21 |
-
GLUTIL_EXT(GLuint, glCreateShader, GLenum type);
|
| 22 |
-
GLUTIL_EXT(void, glDrawBuffers, GLsizei n, const GLenum* bufs);
|
| 23 |
-
GLUTIL_EXT(void, glEnableVertexAttribArray, GLuint index);
|
| 24 |
-
GLUTIL_EXT(void, glGetProgramInfoLog, GLuint program, GLsizei bufSize, GLsizei* length, char* infoLog);
|
| 25 |
-
GLUTIL_EXT(void, glGetProgramiv, GLuint program, GLenum pname, GLint* param);
|
| 26 |
-
GLUTIL_EXT(void, glLinkProgram, GLuint program);
|
| 27 |
-
GLUTIL_EXT(void, glShaderSource, GLuint shader, GLsizei count, const char *const* string, const GLint* length);
|
| 28 |
-
GLUTIL_EXT(void, glUniform1f, GLint location, GLfloat v0);
|
| 29 |
-
GLUTIL_EXT(void, glUniform2f, GLint location, GLfloat v0, GLfloat v1);
|
| 30 |
-
GLUTIL_EXT(void, glUseProgram, GLuint program);
|
| 31 |
-
GLUTIL_EXT(void, glVertexAttribPointer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void* pointer);
|
| 32 |
-
#endif
|
| 33 |
-
#ifndef GL_VERSION_3_2
|
| 34 |
-
GLUTIL_EXT(void, glFramebufferTexture, GLenum target, GLenum attachment, GLuint texture, GLint level);
|
| 35 |
-
#endif
|
| 36 |
-
#ifndef GL_ARB_framebuffer_object
|
| 37 |
-
GLUTIL_EXT(void, glBindFramebuffer, GLenum target, GLuint framebuffer);
|
| 38 |
-
GLUTIL_EXT(void, glGenFramebuffers, GLsizei n, GLuint* framebuffers);
|
| 39 |
-
#endif
|
| 40 |
-
#ifndef GL_ARB_vertex_array_object
|
| 41 |
-
GLUTIL_EXT(void, glBindVertexArray, GLuint array);
|
| 42 |
-
GLUTIL_EXT(void, glGenVertexArrays, GLsizei n, GLuint* arrays);
|
| 43 |
-
#endif
|
| 44 |
-
#ifndef GL_ARB_multi_draw_indirect
|
| 45 |
-
GLUTIL_EXT(void, glMultiDrawElementsIndirect, GLenum mode, GLenum type, const void *indirect, GLsizei primcount, GLsizei stride);
|
| 46 |
-
#endif
|
| 47 |
-
|
| 48 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|