用户您好,请详细描述您所遇到的问题,这会帮助我们快速定位问题~
2023-07-11 11:12:58,659 INFO [logger.py:147] Node[0] ==================================================BEGIN QAT STAGE==================================================
2023-07-11 11:12:58,700 INFO [thread_init.py:38] Node[0] init torch_num_thread is `12`,opencv_num_thread is `12`,openblas_num_thread is `12`,mkl_num_thread is `12`,omp_num_thread is `12`,
2023-07-11 11:12:58,926 INFO [converters.py:56] Node[0] Successfully convert float model to qat model.
2023-07-11 11:12:58,927 WARNING [hash.py:218] Node[0] Don not found hash value in name of /open_explorer/work_dir/114/qat_train_hat_env/calibration_checkpoint/calibration-checkpoint-last.pth.tar, will skip check hash...
2023-07-11 11:12:58,956 WARNING [checkpoint.py:44] Node[0] module. is not at the beginning of state dict
2023-07-11 11:12:59,035 INFO [checkpoint.py:177] Node[0] state_dict in checkpoint num: 1076
2023-07-11 11:12:59,039 INFO [checkpoint.py:178] Node[0] state_dict in model num: 1076
2023-07-11 11:12:59,039 WARNING [checkpoint.py:179] Node[0] miss_key num: 0
2023-07-11 11:12:59,039 WARNING [checkpoint.py:182] Node[0] unexpect_key num: 0
2023-07-11 11:12:59,039 INFO [converters.py:248] Node[0] Load the checkpoint successfully from /open_explorer/work_dir/114/qat_train_hat_env/calibration_checkpoint/calibration-checkpoint-last.pth.tar
2023-07-11 11:13:03,578 INFO [loop_base.py:372] Node[0] Start DistributedDataParallelTrainer loop from epoch 0, num_epochs=10
2023-07-11 11:13:03,579 INFO [grad_scale.py:54] Node[0] [GradScale] []
2023-07-11 11:13:03,581 INFO [monitor.py:107] Node[0] Epoch[0] Begin ==================================================
2023-07-11 11:13:30,340 ERROR [ddp_trainer.py:363] Node[0] Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/hat/engine/ddp_trainer.py", line 359, in _with_exception
fn(*args)
File "/open_explorer/project/ObjectDetection/tools/train.py", line 187, in train_entrance
trainer.fit()
File "/usr/local/lib/python3.8/dist-packages/hat/engine/loop_base.py", line 433, in fit
self.batch_processor(
File "/usr/local/lib/python3.8/dist-packages/hat/engine/processors/processor.py", line 442, in __call__
model_outs = model(*_as_list(batch_i))
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/root/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.local/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/root/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/models/structures/detectors/centerpoint.py", line 99, in forward
input_features = self.reader(
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/root/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/models/task_modules/lidar/pillar_encoder.py", line 222, in forward
features = self._extract_feature(features)
File "/usr/local/lib/python3.8/dist-packages/hat/models/task_modules/lidar/pillar_encoder.py", line 237, in _extract_feature
features = pfn(features)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/root/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/models/task_modules/lidar/pillar_encoder.py", line 81, in forward
x = self.linear(inputs)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/root/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/horizon_plugin_pytorch/nn/qat/conv2d.py", line 221, in forward
return self.activation_post_process(out)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/root/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/horizon_plugin_pytorch/quantization/fake_quantize.py", line 207, in forward
self.activation_post_process(x.detach())
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/hat/utils/module_patch.py", line 46, in _wrap
return fn(self, *args, **kwargs)
File "/root/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/horizon_plugin_pytorch/quantization/observer.py", line 322, in forward
(self.min_val, self.max_val,) = compute_moving_average(
RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch)
nvrtc compilation failed:
#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)
template<typename T>
__device__ T maximum(T a, T b) {
return isnan(a) ? a : (a > b ? a : b);
}
template<typename T>
__device__ T minimum(T a, T b) {
return isnan(a) ? a : (a < b ? a : b);
}
extern "C" __global__
void fused_sub_mul_add_sub_mul_add(float* told_min_1, double vaveraging_constant_1, float* tcurrent_min_1, float* told_max_1, float* tcurrent_max_1, float* aten_add_1, float* aten_add) {
{
if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<64ll ? 1 : 0) {
float told_max_1_1 = __ldg(told_max_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
float v = __ldg(tcurrent_max_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = told_max_1_1 + (v - told_max_1_1) * (float)(vaveraging_constant_1);
float told_min_1_1 = __ldg(told_min_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
float v_1 = __ldg(tcurrent_min_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
aten_add_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = told_min_1_1 + (v_1 - told_min_1_1) * (float)(vaveraging_constant_1);
}}
}
ERROR:__main__:launch trainer failed! process 0 terminated with exit code 1
Traceback (most recent call last):
File "/open_explorer/project/ObjectDetection/tools/train.py", line 278, in <module>
train(
File "/open_explorer/project/ObjectDetection/tools/train.py", line 273, in train
raise e
File "/open_explorer/project/ObjectDetection/tools/train.py", line 256, in train
launch(
File "/usr/local/lib/python3.8/dist-packages/hat/engine/ddp_trainer.py", line 328, in launch
mp.spawn(
File "/root/.local/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/root/.local/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/root/.local/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 139, in join
raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1

