引言

在深度学习模型部署阶段,常常需要考虑硬件资源问题,如显存大小、IO读写速度、CPU使用率、内存占用等,对于使用CPU环境的模型,这些问题显得更加重要。

这篇主要是针对 ONNXRUNTIME 模型在 CPU 环境下用 C++ 部署时出现的CPU占用过高以及内存问题进行说明,并尝试解决。主要包含四个部分:

  • 编写占用CPU高的小程序,便于测试
  • 编写获取进程CPU使用率和内存占用的程序
  • 编写限制进程CPU使用率的程序
  • 一些项目过程中的经验

Demo

主要通过循环的方式,占用CPU,另外由于处理器多核的缘故,需要创建线程来实现所有核心CPU的使用率跑满。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// Windows variant of the CPU-load demo.
// NOTE: the compiler predefines _WIN32 on Windows targets, so it must not be
// (re)defined here; the former `#define(_WIN32)` line was not even a valid
// preprocessor directive and stopped the Windows build.
#ifdef _WIN32
#include <windows.h>
#include <iostream>
#include <cstdlib>

using namespace std;

// Thread entry point that keeps one CPU core busy forever.
//
// lpParameter: unused; present only because CreateThread requires this
//              signature.
// Returns:     never returns in practice (the loop has no exit); the
//              trailing return only satisfies the signature.
DWORD WINAPI FunProc(LPVOID lpParameter)
{
    (void)lpParameter;

    // The accumulator is volatile so the compiler can neither remove the
    // arithmetic nor treat the endless loop as free of side effects.
    // (The previous version incremented and dereferenced an uninitialized
    // pointer, which is undefined behaviour.)
    volatile float acc = 0.0f;

    for (;;)
    {
        acc = acc + 1.144564542315345f;
    }
    return 0;
}

int main(int argc, char **argv)
{
while(1){
HANDLE hThread;
for (int i = 0; i < 4; i++)
{
hThread = CreateThread(NULL, 0, FunProc, &i, 0, NULL);
CloseHandle(hThread);
}
}
system("pause");
return 0;
}
#elif defined(__GNUC__)
#include <unistd.h>

// POSIX load generator: runs one busy process per online CPU.
//
// The parent forks (cores - 1) children; each child leaves the fork loop
// immediately, so the number of processes is bounded. (An unconditional
// `for(;;) fork();` is a fork bomb that exhausts the process table.)
// Parent and children then spin forever; stop them with Ctrl-C, which
// signals the whole foreground process group.
int main() {
    const long online = sysconf(_SC_NPROCESSORS_ONLN);
    const long workers = (online > 0) ? online : 1;

    for (long i = 1; i < workers; ++i) {
        const pid_t pid = fork();
        if (pid <= 0) {
            // 0: we are the child and must not fork again.
            // <0: fork failed; carry on with the workers created so far.
            break;
        }
    }

    // volatile keeps the busy loop from being optimised away.
    volatile double acc = 0.0;
    for (;;) {
        acc = acc + 1.144564542315345;
    }
}
#endif

获取CPU信息

通过获取对应进程的 pid 实时监控CPU使用率,该例子支持跨平台。

  • linux下其实是通过读取并解析/proc目录下进程虚拟文件对应字段值计算得到
  • windows下调用系统api计算得到
  • 这种定时采样获取CPU和内存占用的方式,得到的数据与在系统任务管理器中看到的不完全一致
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#include <string.h>

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <thread>

#ifdef WIN32
#include <windows.h>
#include <psapi.h>
//#include <tlhelp32.h>
#include <direct.h>
#include <process.h>
#else
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/time.h>
#include <unistd.h>
#endif

// Returns the id of the calling process.
//
// On Windows the CRT function is spelled `_getpid` (<process.h>); the POSIX
// spelling `getpid` is only a deprecated alias there and is rejected by
// stricter MSVC configurations. Other platforms use `getpid` from <unistd.h>.
inline int GetCurrentPid()
{
#ifdef WIN32
    return _getpid();
#else
    return getpid();
#endif
}

// get specific process cpu occupation ratio by pid
#ifdef WIN32
// Combines the high and low 32-bit halves of a FILETIME into a single
// 64-bit value (high half in the upper 32 bits).
static uint64_t convert_time_format(const FILETIME* ftime)
{
    const uint64_t high_half = ftime->dwHighDateTime;
    const uint64_t low_half = ftime->dwLowDateTime;

    return (high_half << 32) | low_half;
}
#else
// FIXME: can also get cpu and mem status from popen cmd
// the info line num in /proc/{pid}/status file
#define VMRSS_LINE 22
#define PROCESS_ITEM 14

// Returns a pointer to the start of the `item`-th space-separated field of
// `buffer` (1-based).
//
// buffer: NUL-terminated text whose fields are separated by single spaces.
// item:   1-based field index. 0 and 1 both yield the first field, i.e.
//         `buffer` itself (previously they ran to the end of the string).
// Returns a pointer into `buffer`; if the text has fewer than `item` fields
// the pointer addresses the terminating NUL (an empty string).
//
// Every space counts as a separator, so two consecutive spaces delimit an
// empty field.
static const char* get_items(const char* buffer, unsigned int item)
{
    if (item <= 1)
        return buffer;

    unsigned int spaces_seen = 0;
    const char* p = buffer;

    for (; *p != '\0'; ++p)
    {
        if (' ' != *p)
            continue;

        ++spaces_seen;
        if (spaces_seen == item - 1)
            return p + 1; // first character after the separating space
    }

    return p;
}

// Returns the machine-wide CPU time accumulated so far, read from the first
// ("cpu") line of /proc/stat, in the tick unit that file reports.
//
// The total is the sum of user, nice, system, idle, iowait, irq, softirq and
// steal. The four trailing counters were previously ignored, which made the
// denominator of the usage ratio too small on machines with I/O wait or
// interrupt load. Kernels that expose fewer columns simply contribute 0 for
// the missing ones.
//
// Returns 0 if the file cannot be opened, read or parsed.
static inline unsigned long get_cpu_total_occupy()
{
    FILE* fd = fopen("/proc/stat", "r");
    if (nullptr == fd)
        return 0;

    char buff[1024] = { 0 };
    const bool have_line = (fgets(buff, sizeof(buff), fd) != nullptr);
    fclose(fd);
    if (!have_line)
        return 0;

    // Everything starts at 0 so a short line never yields garbage.
    unsigned long user_time = 0;
    unsigned long nice_time = 0;
    unsigned long system_time = 0;
    unsigned long idle_time = 0;
    unsigned long iowait_time = 0;
    unsigned long irq_time = 0;
    unsigned long softirq_time = 0;
    unsigned long steal_time = 0;

    char name[64] = { 0 };
    // %63s bounds the label to the buffer; %lu matches unsigned long.
    const int parsed = sscanf(buff, "%63s %lu %lu %lu %lu %lu %lu %lu %lu",
                              name, &user_time, &nice_time, &system_time, &idle_time,
                              &iowait_time, &irq_time, &softirq_time, &steal_time);
    if (parsed < 5) // label + the four counters every kernel provides
        return 0;

    return user_time + nice_time + system_time + idle_time
         + iowait_time + irq_time + softirq_time + steal_time;
}

// Returns the CPU time consumed so far by process `pid`, read from
// /proc/<pid>/stat: utime + stime + cutime + cstime (fields 14..17, where
// PROCESS_ITEM is the 1-based index of utime).
//
// pid: id of the process to inspect.
// Returns 0 if the file cannot be opened, read or parsed (for example
// because the process has exited).
static inline unsigned long get_cpu_proc_occupy(int pid)
{
    char file_name[64] = { 0 };
    snprintf(file_name, sizeof(file_name), "/proc/%d/stat", pid);

    FILE* fd = fopen(file_name, "r");
    if (nullptr == fd)
        return 0;

    char line_buff[1024] = { 0 };
    const bool have_line = (fgets(line_buff, sizeof(line_buff), fd) != nullptr);
    fclose(fd);
    if (!have_line)
        return 0;

    // Field 2 (the command name) is wrapped in parentheses and may itself
    // contain spaces, so counting spaces from the start of the line can land
    // on the wrong field. Everything after the LAST ')' is plain
    // space-separated data, so counting restarts there.
    const char* comm_end = strrchr(line_buff, ')');
    if (nullptr == comm_end || ' ' != comm_end[1])
        return 0;

    // state_field is field 3, so field PROCESS_ITEM is item PROCESS_ITEM - 2
    // relative to it.
    const char* state_field = comm_end + 2;
    const char* q = get_items(state_field, PROCESS_ITEM - 2);

    unsigned long utime = 0; // user time
    unsigned long stime = 0; // kernel time
    long cutime = 0;         // user time of waited-for children
    long cstime = 0;         // kernel time of waited-for children
    if (sscanf(q, "%lu %lu %ld %ld", &utime, &stime, &cutime, &cstime) != 4)
        return 0;

    return utime + stime + static_cast<unsigned long>(cutime) + static_cast<unsigned long>(cstime);
}
#endif

// Returns the CPU usage of process `pid` as a ratio (multiply by 100 for a
// percentage). Returns 0 when the process cannot be queried.
//
// Windows: compares the process's kernel+user time with the wall-clock time
//   elapsed since the PREVIOUS call. The baseline lives in function-local
//   statics, so the first call only records it and returns 0, and the
//   function is meaningful for one pid at a time. The process time is divided
//   by the number of processors, so the result is a share of the whole
//   machine.
// Linux: takes two samples 200 ms apart (the call blocks for that long) and
//   multiplies the process/total ratio by the number of processors, so the
//   result is relative to a single core and can exceed 1.0.
// NOTE(review): the two platforms therefore normalise differently; confirm
//   which convention callers expect before comparing numbers across them.
inline float GetCpuUsageRatio(int pid)
{
#ifdef WIN32
    static int64_t last_time = 0;
    static int64_t last_system_time = 0;

    // get cpu num
    SYSTEM_INFO info;
    GetSystemInfo(&info);
    const int cpu_num = info.dwNumberOfProcessors;

    // Only query rights are needed for GetProcessTimes; PROCESS_ALL_ACCESS is
    // refused for many processes the caller could otherwise inspect.
    // GetCurrentProcess() would also work for the own process and needs no
    // CloseHandle.
    HANDLE process = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid);
    if (process == NULL)
    {
        printf("GetCpuUsageRatio OpenProcess failed\n");
        return 0.0;
    }

    FILETIME now;
    FILETIME creation_time;
    FILETIME exit_time;
    FILETIME kernel_time;
    FILETIME user_time;

    GetSystemTimeAsFileTime(&now);
    const BOOL got_times = GetProcessTimes(process, &creation_time, &exit_time, &kernel_time, &user_time);

    // The handle is no longer needed; closing it here covers every return
    // path below (the early returns used to leak it).
    CloseHandle(process);

    if (!got_times)
    {
        // We don't assert here because in some cases (such as in the Task Manager)
        // we may call this function on a process that has just exited but we have
        // not yet received the notification.
        printf("GetCpuUsageRatio GetProcessTimes failed\n");
        return 0.0;
    }

    // should handle the multiple cpu num
    const int64_t system_time = (convert_time_format(&kernel_time) + convert_time_format(&user_time)) / cpu_num;
    const int64_t now_time = convert_time_format(&now);

    if ((last_system_time == 0) || (last_time == 0))
    {
        // First call, just set the last values.
        last_system_time = system_time;
        last_time = now_time;
        return 0.0;
    }

    const int64_t system_time_delta = system_time - last_system_time;
    const int64_t time_delta = now_time - last_time;

    if (time_delta == 0)
    {
        printf("GetCpuUsageRatio time_delta is 0, error\n");
        return 0.0;
    }

    // We add time_delta / 2 so the result is rounded.
    float cpu_ratio = (int)((system_time_delta * 100 + time_delta / 2) / time_delta); // the % unit
    last_system_time = system_time;
    last_time = now_time;

    cpu_ratio /= 100.0; // convert to float number

    return cpu_ratio;
#else
    const unsigned long total_before = get_cpu_total_occupy();
    const unsigned long proc_before = get_cpu_proc_occupy(pid);

    // FIXME: the 200ms is a magic number, works well
    usleep(200000); // sleep 200ms between the two snapshots

    const unsigned long total_after = get_cpu_total_occupy();
    const unsigned long proc_after = get_cpu_proc_occupy(pid);

    // Both helpers return 0 on failure (e.g. the process exited between the
    // samples). Bail out instead of letting the unsigned subtraction wrap
    // around to a huge value.
    if (total_after <= total_before || proc_after < proc_before)
        return 0.0f;

    float pcpu = (proc_after - proc_before) / float(total_after - total_before); // float number

    const int cpu_num = get_nprocs();
    pcpu *= cpu_num; // should multiply cpu num in multiple cpu machine

    return pcpu;
#endif
}

// Returns the physical memory currently used by process `pid`, in MB.
//
// Windows: the working-set size reported by GetProcessMemoryInfo.
// Linux:   the "VmRSS:" entry of /proc/<pid>/status (reported in kB).
//
// Returns 0 if the process cannot be opened or the value cannot be read.
inline float GetMemoryUsage(int pid)
{
#ifdef WIN32
    // GetProcessMemoryInfo needs query + VM-read rights only;
    // PROCESS_ALL_ACCESS is refused for many processes.
    // GetCurrentProcess() would also work for the own process and needs no
    // CloseHandle.
    HANDLE process = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_VM_READ, FALSE, pid);
    if (process == NULL)
        return 0;

    uint64_t mem = 0;
    PROCESS_MEMORY_COUNTERS pmc;
    if (GetProcessMemoryInfo(process, &pmc, sizeof(pmc)))
        mem = pmc.WorkingSetSize;
    CloseHandle(process);

    // convert mem from B to MB
    return static_cast<float>(mem / 1024.0 / 1024.0);

#else
    char file_name[64] = { 0 };
    snprintf(file_name, sizeof(file_name), "/proc/%d/status", pid);

    FILE* fd = fopen(file_name, "r");
    if (nullptr == fd)
        return 0;

    // Search for the entry by its label: the line number of VmRSS differs
    // between kernel versions, so a fixed line index reads the wrong field.
    static const char kLabel[] = "VmRSS:";
    const size_t label_len = sizeof(kLabel) - 1;

    long vmrss_kb = 0;
    char line_buff[512] = { 0 };
    while (fgets(line_buff, sizeof(line_buff), fd) != nullptr)
    {
        if (strncmp(line_buff, kLabel, label_len) != 0)
            continue;

        if (sscanf(line_buff + label_len, "%ld", &vmrss_kb) != 1)
            vmrss_kb = 0;
        break;
    }
    fclose(fd);

    // convert VmRSS from KB to MB
    return static_cast<float>(vmrss_kb / 1024.0);
#endif
}

// Demo driver: prints the pid, CPU usage and memory usage of this process
// about once per second, forever.
int main()
{
    // launch a few short-lived background threads as sample workload
    for (int i = 0; i < 5; i++)
        std::thread([]
        {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
        }).detach();

    const int current_pid = GetCurrentPid(); // or you can set a outside program pid

    while (true)
    {
        // Sample inside the loop. Measuring once before the loop printed the
        // same stale numbers forever, and on Windows the first
        // GetCpuUsageRatio call only records a baseline and returns 0.
        const float cpu_usage_ratio = GetCpuUsageRatio(current_pid);
        const float memory_usage = GetMemoryUsage(current_pid);

        std::cout << "current pid: " << current_pid << std::endl;
        std::cout << "cpu usage ratio: " << cpu_usage_ratio * 100 << "%" << std::endl;
        std::cout << "memory usage: " << memory_usage << "MB" << std::endl;

        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
    }

    return 0;
}

运行效果

1
2
3
current pid: 1476528
cpu usage ratio: 0%
memory usage: 4.83984MB

资源控制

Windows提供了一个叫做 JobObject 的机制,可以限制进程的CPU利用率、CPU核心、内存、网络带宽、管理员权限等等。

使用流程如下:

  1. 通过 CreateJobObjectA创建一个 JobObject
  2. 创建一个job information的结构体,设置自己要控制的值。有很多不同的结构体声明,分别用来控制不同的属性
  3. 通过 SetInformationJobObject来将刚才创建的information设给JobObject
  4. 通过 AssignProcessToJobObject来将某个进程分配给Job Object,这样,这个进程就会受到Job Object的控制了

注意限制子进程不需要管理员权限。如果要限制已经存在的进程,就需要用管理员权限运行。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#include <windows.h>
#include <stdexcept>
#include <iostream>
#include <string>
using namespace std::string_literals;

// Abstract, non-copyable interface for a process that can be handed to a
// job object: it exposes the process handle and a blocking wait.
class Process
{
public:
    Process() = default;
    Process(const Process&) = delete;
    // Returns a reference: an abstract class cannot be returned by value, so
    // the previous by-value signature is rejected by stricter compilers even
    // though the function is deleted.
    Process& operator=(const Process&) = delete;
    virtual ~Process() = default;

    // Handle of the underlying process; ownership stays with the object.
    virtual HANDLE handle() const = 0;
    // Blocks until the underlying process has exited.
    virtual void wait() const = 0;
};

class AttachProcess : public Process
{
private:
HANDLE h;
public:
explicit AttachProcess(DWORD pid)
{
h = OpenProcess(PROCESS_ALL_ACCESS, FALSE, pid);
if (h == NULL)
throw std::runtime_error("Failed to open process");
}
~AttachProcess() { CloseHandle(h); }
HANDLE handle() const override { return h; }
void wait() const override
{
WaitForSingleObject(h, INFINITE);
}
};

class ChildProcess : public Process
{
private:
STARTUPINFOA si;
PROCESS_INFORMATION pi;
public:
ChildProcess(const ChildProcess&) = delete;
ChildProcess operator=(const ChildProcess&) = delete;
explicit ChildProcess(LPCSTR applicationPath)
{
ZeroMemory(&si, sizeof(si));
si.cb = sizeof(si);
ZeroMemory(&pi, sizeof(pi));
if (!CreateProcessA(applicationPath,
NULL, NULL, NULL, FALSE, CREATE_SUSPENDED, NULL, NULL,
&si, &pi))
{
throw std::runtime_error("Failed to create child process!");
}
}
void start()
{
ResumeThread(pi.hThread);
}
void wait() const override
{
WaitForSingleObject(pi.hProcess, INFINITE);
}
HANDLE handle() const override
{
return pi.hProcess;
}
~ChildProcess()
{
WaitForSingleObject(pi.hProcess, INFINITE);
CloseHandle(pi.hProcess);
CloseHandle(pi.hThread);
}
};

// Abstract, non-copyable description of one job-object setting. It supplies
// the three arguments SetInformationJobObject needs besides the job handle:
// the information class, a pointer to the structure, and its size.
class JobObjectInformation
{
public:
    JobObjectInformation() = default;
    JobObjectInformation(const JobObjectInformation&) = delete;
    // Returns a reference: an abstract class cannot be returned by value.
    JobObjectInformation& operator=(const JobObjectInformation&) = delete;

    // Information class selecting which setting is applied.
    virtual JOBOBJECTINFOCLASS getClass() const = 0;
    // Size in bytes of the structure behind informationPtr().
    virtual DWORD length() const = 0;
    // Pointer to the structure holding the setting.
    virtual LPVOID informationPtr() const = 0;
    virtual ~JobObjectInformation() = default;
};


class JobObject
{
public:
JobObject(const JobObject&) = delete;
JobObject operator=(const JobObject&) = delete;
JobObject()
{
jobHandle = CreateJobObjectA(NULL, NULL);
if (jobHandle == NULL) {
throw std::runtime_error("Create job failed!");
}
}
HANDLE getHandle()
{
return jobHandle;
}
void assignProcess(const Process &p)
{
if (AssignProcessToJobObject(jobHandle, p.handle()) == 0)
throw std::runtime_error("Failed to assgin process to job: "s + std::to_string(GetLastError()));
}
BOOL setInformation(const JobObjectInformation& i)
{
return SetInformationJobObject(jobHandle, i.getClass(), i.informationPtr(), i.length());
}
~JobObject()
{
CloseHandle(jobHandle);
}

private:
HANDLE jobHandle{};
};



// JobObjectInformation that enables CPU rate control with a hard cap.
class CpuRateJobObjectInformation : public JobObjectInformation
{
public:
    // rate: CPU limit, accepted range 1..100.
    // The structure's CpuRate field is filled with rate * 100 (presumably the
    // API's hundredths-of-a-percent unit; confirm against the Win32 docs).
    // Throws std::runtime_error for a rate outside 1..100.
    CpuRateJobObjectInformation(int rate)
    {
        const bool rateInRange = (rate > 0) && (rate <= 100);
        if (!rateInRange)
        {
            throw std::runtime_error("Invalid argument");
        }

        information.ControlFlags = JOB_OBJECT_CPU_RATE_CONTROL_ENABLE | JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP;
        information.CpuRate = rate * 100;
    }

    JOBOBJECTINFOCLASS getClass() const override
    {
        return JobObjectCpuRateControlInformation;
    }

    DWORD length() const override
    {
        return sizeof(information);
    }

    // The interface hands out a non-const pointer, so constness is cast away.
    LPVOID informationPtr() const override
    {
        return const_cast<JOBOBJECT_CPU_RATE_CONTROL_INFORMATION*>(&information);
    }

private:
    JOBOBJECT_CPU_RATE_CONTROL_INFORMATION information{};
};

int main(int argc, const char **argv)
{
JobObject jobObject;
CpuRateJobObjectInformation information{20};
jobObject.setInformation(information);
// 通过直接打开程序来限制
ChildProcess childProcess{ "c:\\测试demo.exe"};

jobObject.assignProcess(childProcess);
childProcess.start();
childProcess.wait();

// 通过运行程序的PID来限制
DWORD pid;
std::cout << "Input PID:" << std::endl;
std::cin >> pid;
AttachProcess attachProcess{ pid };
try
{
jobObject.assignProcess(attachProcess);
}
catch (const std::exception& e)
{
std::cout << e.what();
}
attachProcess.wait();
return 0;
}

经验

以上的方式都是在程序外部做限制。如果被限制的程序有源码,则可以先从程序内部排查问题,比如借助 VS探查器 等工具分析是哪一部分代码占用的CPU资源比较大;其他工具如有需要,后续再详细说明。

思路说明:

CPU占用过高,一般是发生在循环内部,或者是多线程占用了资源。

既然知道了原因,那么就有对应的解决办法:

  1. 优化循环部分的程序,看是否能简化循环步骤或利用硬件特性(比如在一个循环内同时处理多个数据)来解决
  2. sleep(0)的妙用:主动让出当前线程剩余的时间片,使其他就绪线程得以运行

另一种也是在程序外处理:

  1. 绑定线程到指定的CPU核心

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    // The second argument is a bit mask: bit N set means the thread may run
    // on CPU N, so pinning a thread to one CPU needs a mask with one bit set.

    // Thread 0 can only run on CPU 0 (bit 0).
    SetThreadAffinityMask(hThread0, 0x00000001);

    // Threads 1, 2, 3 run on CPUs 1, 2, 3 (bits 1, 2, 3).
    // 0x3 would mean "CPU 0 or CPU 1" and 0x4 is CPU 2, not CPU 3.
    SetThreadAffinityMask(hThread1, 0x00000002);

    SetThreadAffinityMask(hThread2, 0x00000004);

    SetThreadAffinityMask(hThread3, 0x00000008);
  2. 设置优先级

    具体的内容见参考5

以上就是工程中的一些经验分享,更深入的内容可以再交流。对于深度学习模型在CPU上把CPU资源跑满的开发者们,可以尝试这些方式,避免电脑卡死。

参考

  1. https://zhuanlan.zhihu.com/p/266839249
  2. https://zhuanlan.zhihu.com/p/498134424
  3. https://blog.csdn.net/qiaoquan3/article/details/56281092
  4. https://www.cnblogs.com/kex1n/archive/2011/05/09/2040924.html
  5. https://blog.csdn.net/youshijian99/article/details/79682054