# 性能
您可以使用 `genai-perf` 工具,在模拟生产负载下,对文本重排序 NIM 的性能进行基准测试。`genai-perf` 已预装在 Triton Server SDK 容器中。
要运行性能基准测试,请首先创建一个文本示例数据集,供 `genai-perf` 在向重排序服务发出请求时使用。这些示例应能代表您预期在生产环境中收到的数据类型。数据集应采用 JSONL 文件格式,其中每行包含一个 `{"text": ...}` 对象。您需要按照此格式在同一目录中创建两个文件:`queries.jsonl` 和 `passages.jsonl`。`genai-perf` 将随机组合查询-段落对,以向服务发出请求。
示例
queries.jsonl
{"text": "What was the first car ever driven?"}
{"text": "Who served as the 5th President of the United States of America?"}
{"text": "Is the Sydney Opera House located in Australia?"}
{"text": "In what state did they film Shrek 2?"}
passages.jsonl
{"text": "Eric Anderson (born January 18, 1968) is an American sociologist and sexologist."}
{"text": "Kevin Loader is a British film and television producer."}
{"text": "Francisco Antonio Zea Juan Francisco Antonio Hilari was a Colombian journalist, botanist, diplomat, politician, and statesman who served as the 1st Vice President of Colombia."}
{"text": "Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget."}
参照以下示例运行 Triton Inference Server SDK Docker 容器,并挂载您创建 JSONL 文件的目录(下例中为 `datasets/`)。
export RELEASE="yy.mm" # e.g. export RELEASE="24.10"
docker run -it --rm \
--gpus=all \
--network="host" \
--mount type=bind,source=${PWD}/datasets,target=/datasets \
nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
执行以下命令,使用 `genai-perf` 命令行工具运行性能基准测试。
genai-perf profile \
-m nvidia/nv-rerankqa-mistral-4b-v3 \
--service-kind openai \
--endpoint-type rankings \
--batch-size 10 \
--input-file /datasets/ \
--extra-inputs truncate:END \
--concurrency 5 \
--url http://127.0.0.1:8000
您可以在 GenAI-Perf 文档的“命令行选项”部分中查看 `genai-perf` 的完整命令行选项集。
## 基准测试
所有延迟测量均以毫秒为单位报告。
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 33.0 | 31.0 | 37.0 | 37.0 | 307.3 |
| 512 | 10 | 3 | 62.0 | 63.0 | 72.0 | 74.0 | 474.9 |
| 512 | 10 | 5 | 103.0 | 104.0 | 113.0 | 116.0 | 471.7 |
| 512 | 20 | 1 | 57.0 | 59.0 | 64.0 | 65.0 | 351.1 |
| 512 | 20 | 3 | 124.0 | 123.0 | 139.0 | 140.0 | 475.7 |
| 512 | 20 | 5 | 206.0 | 207.0 | 227.0 | 230.0 | 477.7 |
| 512 | 40 | 1 | 99.0 | 99.0 | 109.0 | 110.0 | 402.8 |
| 512 | 40 | 3 | 244.0 | 248.0 | 261.0 | 267.0 | 483.4 |
| 512 | 40 | 5 | 405.0 | 414.0 | 429.0 | 434.0 | 483.7 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 40.0 | 39.0 | 43.0 | 44.0 | 250.4 |
| 512 | 10 | 3 | 88.0 | 89.0 | 96.0 | 104.0 | 337.7 |
| 512 | 10 | 5 | 145.0 | 148.0 | 153.0 | 154.0 | 337.1 |
| 512 | 20 | 1 | 71.0 | 74.0 | 78.0 | 79.0 | 280.1 |
| 512 | 20 | 3 | 171.0 | 172.0 | 184.0 | 187.0 | 345.0 |
| 512 | 20 | 5 | 284.0 | 288.0 | 305.0 | 310.0 | 345.3 |
| 512 | 40 | 1 | 129.0 | 129.0 | 139.0 | 140.0 | 309.5 |
| 512 | 40 | 3 | 341.0 | 346.0 | 358.0 | 361.0 | 347.5 |
| 512 | 40 | 5 | 565.0 | 575.0 | 592.0 | 601.0 | 347.2 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 58.0 | 58.0 | 61.0 | 61.0 | 170.8 |
| 512 | 10 | 3 | 149.0 | 150.0 | 159.0 | 204.0 | 197.8 |
| 512 | 10 | 5 | 246.0 | 250.0 | 257.0 | 260.0 | 198.8 |
| 512 | 20 | 1 | 119.0 | 117.0 | 123.0 | 124.0 | 168.5 |
| 512 | 20 | 3 | 315.0 | 319.0 | 325.0 | 326.0 | 187.9 |
| 512 | 20 | 5 | 523.0 | 532.0 | 540.0 | 540.0 | 187.5 |
| 512 | 40 | 1 | 234.0 | 234.0 | 242.0 | 243.0 | 171.1 |
| 512 | 40 | 3 | 652.0 | 661.0 | 670.0 | 674.0 | 181.6 |
| 512 | 40 | 5 | 1080.0 | 1101.0 | 1118.0 | 1120.0 | 181.4 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 78.0 | 78.0 | 80.0 | 80.0 | 128.2 |
| 512 | 10 | 3 | 205.0 | 207.0 | 210.0 | 212.0 | 144.8 |
| 512 | 10 | 5 | 340.0 | 346.0 | 354.0 | 357.0 | 144.0 |
| 512 | 20 | 1 | 158.0 | 157.0 | 162.0 | 163.0 | 126.8 |
| 512 | 20 | 3 | 430.0 | 436.0 | 443.0 | 446.0 | 137.2 |
| 512 | 20 | 5 | 716.0 | 728.0 | 739.0 | 744.0 | 136.9 |
| 512 | 40 | 1 | 312.0 | 312.0 | 320.0 | 321.0 | 128.0 |
| 512 | 40 | 3 | 886.0 | 896.0 | 907.0 | 910.0 | 134.0 |
| 512 | 40 | 5 | 1463.0 | 1492.0 | 1512.0 | 1515.0 | 134.0 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 74.0 | 74.0 | 79.0 | 80.0 | 134.8 |
| 512 | 10 | 3 | 185.0 | 188.0 | 202.0 | 244.0 | 157.3 |
| 512 | 10 | 5 | 311.0 | 315.0 | 325.0 | 329.0 | 157.6 |
| 512 | 20 | 1 | 139.0 | 137.0 | 149.0 | 150.0 | 143.5 |
| 512 | 20 | 3 | 371.0 | 373.0 | 394.0 | 398.0 | 159.8 |
| 512 | 20 | 5 | 615.0 | 622.0 | 644.0 | 648.0 | 159.5 |
| 512 | 40 | 1 | 267.0 | 266.0 | 286.0 | 290.0 | 149.8 |
| 512 | 40 | 3 | 744.0 | 752.0 | 788.0 | 799.0 | 159.1 |
| 512 | 40 | 5 | 1231.0 | 1252.0 | 1296.0 | 1316.0 | 159.0 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 79.0 | 79.0 | 84.0 | 85.0 | 126.4 |
| 512 | 10 | 3 | 203.0 | 206.0 | 217.0 | 271.0 | 144.5 |
| 512 | 10 | 5 | 339.0 | 346.0 | 354.0 | 358.0 | 144.4 |
| 512 | 20 | 1 | 151.0 | 151.0 | 160.0 | 161.0 | 132.0 |
| 512 | 20 | 3 | 405.0 | 411.0 | 428.0 | 435.0 | 145.4 |
| 512 | 20 | 5 | 672.0 | 685.0 | 710.0 | 717.0 | 145.5 |
| 512 | 40 | 1 | 289.0 | 287.0 | 307.0 | 310.0 | 138.2 |
| 512 | 40 | 3 | 811.0 | 817.0 | 842.0 | 851.0 | 146.3 |
| 512 | 40 | 5 | 1340.0 | 1357.0 | 1405.0 | 1412.0 | 146.2 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 230.0 | 229.0 | 240.0 | 244.0 | 43.5 |
| 512 | 10 | 3 | 639.0 | 644.0 | 667.0 | 675.0 | 46.5 |
| 512 | 10 | 5 | 1055.0 | 1076.0 | 1101.0 | 1107.0 | 46.4 |
| 512 | 20 | 1 | 447.0 | 442.0 | 464.0 | 468.0 | 44.7 |
| 512 | 20 | 3 | 1257.0 | 1276.0 | 1310.0 | 1320.0 | 46.9 |
| 512 | 20 | 5 | 2088.0 | 2126.0 | 2172.0 | 2185.0 | 46.9 |
| 512 | 40 | 1 | 877.0 | 879.0 | 902.0 | 906.0 | 45.6 |
| 512 | 40 | 3 | 2534.0 | 2566.0 | 2605.0 | 2617.0 | 46.7 |
| 512 | 40 | 5 | 4194.0 | 4273.0 | 4321.0 | 4339.0 | 46.7 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 190.0 | 187.0 | 197.0 | 199.0 | 52.7 |
| 512 | 10 | 3 | 534.0 | 543.0 | 554.0 | 557.0 | 55.2 |
| 512 | 10 | 5 | 888.0 | 907.0 | 924.0 | 925.0 | 55.2 |
| 512 | 20 | 1 | 383.0 | 384.0 | 395.0 | 397.0 | 52.1 |
| 512 | 20 | 3 | 1117.0 | 1134.0 | 1150.0 | 1153.0 | 53.0 |
| 512 | 20 | 5 | 1850.0 | 1885.0 | 1908.0 | 1916.0 | 53.0 |
| 512 | 40 | 1 | 768.0 | 768.0 | 785.0 | 787.0 | 52.1 |
| 512 | 40 | 3 | 2254.0 | 2279.0 | 2302.0 | 2308.0 | 52.7 |
| 512 | 40 | 5 | 3712.0 | 3795.0 | 3834.0 | 3839.0 | 52.7 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 188.0 | 188.0 | 196.0 | 197.0 | 53.2 |
| 512 | 10 | 3 | 533.0 | 541.0 | 554.0 | 559.0 | 55.4 |
| 512 | 10 | 5 | 884.0 | 902.0 | 917.0 | 922.0 | 55.4 |
| 512 | 20 | 1 | 383.0 | 382.0 | 396.0 | 399.0 | 52.2 |
| 512 | 20 | 3 | 1119.0 | 1134.0 | 1147.0 | 1152.0 | 53.1 |
| 512 | 20 | 5 | 1848.0 | 1884.0 | 1910.0 | 1916.0 | 53.0 |
| 512 | 40 | 1 | 767.0 | 767.0 | 784.0 | 788.0 | 52.2 |
| 512 | 40 | 3 | 2245.0 | 2276.0 | 2301.0 | 2308.0 | 52.8 |
| 512 | 40 | 5 | 3714.0 | 3790.0 | 3825.0 | 3831.0 | 52.8 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 59.2 | 58.6 | 59.4 | 59.7 | 168.6 |
| 512 | 20 | 1 | 109.8 | 108.6 | 109.7 | 110.3 | 181.9 |
| 512 | 40 | 1 | 199.8 | 198.3 | 199.5 | 200.2 | 200.0 |
| 512 | 10 | 3 | 145.1 | 145.1 | 145.8 | 146.0 | 206.5 |
| 512 | 20 | 3 | 277.4 | 277.3 | 278.9 | 279.4 | 216.0 |
| 512 | 40 | 3 | 557.2 | 557.9 | 559.8 | 560.5 | 215.1 |
| 512 | 10 | 5 | 230.6 | 230.5 | 231.9 | 232.5 | 216.6 |
| 512 | 20 | 5 | 472.3 | 472.7 | 474.4 | 474.9 | 211.5 |
| 512 | 40 | 5 | 934.6 | 935.9 | 938.6 | 939.1 | 213.7 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 75.9 | 75.1 | 76.7 | 77.6 | 131.5 |
| 512 | 20 | 1 | 145.4 | 143.9 | 146.8 | 148.9 | 137.4 |
| 512 | 40 | 1 | 266.0 | 264.8 | 268.8 | 271.4 | 150.2 |
| 512 | 10 | 3 | 203.4 | 203.5 | 204.9 | 205.3 | 147.3 |
| 512 | 20 | 3 | 380.3 | 380.6 | 383.4 | 384.4 | 157.6 |
| 512 | 40 | 3 | 768.8 | 770.2 | 772.7 | 773.1 | 155.8 |
| 512 | 10 | 5 | 316.0 | 316.2 | 319.6 | 320.4 | 158.1 |
| 512 | 20 | 5 | 654.0 | 654.6 | 657.1 | 657.7 | 152.7 |
| 512 | 40 | 5 | 1291.4 | 1293.7 | 1297.1 | 1297.5 | 154.6 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 145.1 | 145.2 | 148.5 | 149.2 | 68.9 |
| 512 | 20 | 1 | 297.2 | 295.7 | 298.4 | 299.1 | 67.3 |
| 512 | 40 | 1 | 585.3 | 586.1 | 591.0 | 591.8 | 68.3 |
| 512 | 10 | 3 | 425.0 | 425.9 | 434.5 | 435.7 | 70.5 |
| 512 | 20 | 3 | 857.7 | 860.4 | 870.1 | 871.8 | 69.8 |
| 512 | 40 | 3 | 1752.4 | 1773.1 | 1787.9 | 1791.5 | 68.2 |
| 512 | 10 | 5 | 714.2 | 717.6 | 724.4 | 725.5 | 69.9 |
| 512 | 20 | 5 | 1456.2 | 1461.0 | 1474.4 | 1477.2 | 68.5 |
| 512 | 40 | 5 | 2909.7 | 2877.8 | 3558.4 | 3564.7 | 68.1 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 220.1 | 219.6 | 222.0 | 224.4 | 45.4 |
| 512 | 20 | 1 | 459.9 | 459.4 | 463.1 | 463.7 | 43.5 |
| 512 | 40 | 1 | 918.5 | 923.7 | 932.3 | 934.3 | 43.5 |
| 512 | 10 | 3 | 664.0 | 667.2 | 677.6 | 678.5 | 45.1 |
| 512 | 20 | 3 | 1378.7 | 1394.0 | 1409.0 | 1410.9 | 43.4 |
| 512 | 40 | 3 | 2782.8 | 2803.1 | 2838.5 | 2839.8 | 42.9 |
| 512 | 10 | 5 | 1138.4 | 1151.4 | 1164.2 | 1165.4 | 43.8 |
| 512 | 20 | 5 | 2335.1 | 2351.6 | 2374.7 | 2377.9 | 42.7 |
| 512 | 40 | 5 | 4653.8 | 4701.1 | 4757.4 | 4758.4 | 42.7 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 179.8 | 175.2 | 177.8 | 240.4 | 55.6 |
| 512 | 20 | 1 | 351.2 | 343.6 | 345.8 | 451.4 | 56.9 |
| 512 | 40 | 1 | 672.7 | 658.6 | 715.5 | 738.2 | 59.5 |
| 512 | 10 | 3 | 498.5 | 499.7 | 502.7 | 503.2 | 60.1 |
| 512 | 20 | 3 | 968.1 | 970.2 | 971.6 | 971.9 | 61.9 |
| 512 | 40 | 3 | 1952.9 | 1961.7 | 1963.7 | 1964.3 | 61.2 |
| 512 | 10 | 5 | 804.9 | 806.1 | 808.1 | 808.8 | 62.0 |
| 512 | 20 | 5 | 1624.4 | 1631.2 | 1633.3 | 1633.8 | 61.3 |
| 512 | 40 | 5 | 3259.6 | 3277.6 | 3282.3 | 3282.8 | 61.1 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 219.9 | 201.0 | 287.8 | 288.7 | 45.5 |
| 512 | 20 | 1 | 436.9 | 417.7 | 544.2 | 566.5 | 45.8 |
| 512 | 40 | 1 | 836.3 | 803.6 | 1010.0 | 1144.6 | 47.8 |
| 512 | 10 | 3 | 613.9 | 623.1 | 643.7 | 644.4 | 48.8 |
| 512 | 20 | 3 | 1227.0 | 1248.9 | 1275.5 | 1278.1 | 48.8 |
| 512 | 40 | 3 | 2451.1 | 2529.2 | 2536.7 | 2538.0 | 48.7 |
| 512 | 10 | 5 | 1007.0 | 1035.5 | 1039.8 | 1040.8 | 49.6 |
| 512 | 20 | 5 | 2069.8 | 2088.5 | 2181.8 | 2219.3 | 48.2 |
| 512 | 40 | 5 | 4099.7 | 4231.8 | 4256.0 | 4288.3 | 48.3 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 591.1 | 586.2 | 590.3 | 594.1 | 16.9 |
| 512 | 20 | 1 | 1187.8 | 1173.2 | 1175.5 | 1362.1 | 16.8 |
| 512 | 40 | 1 | 2396.1 | 2346.0 | 2535.7 | 2536.2 | 16.7 |
| 512 | 10 | 3 | 1711.4 | 1720.3 | 1720.7 | 1720.7 | 17.4 |
| 512 | 20 | 3 | 3422.8 | 3440.8 | 3441.2 | 3441.3 | 17.4 |
| 512 | 40 | 3 | 6844.3 | 6880.9 | 6881.5 | 6881.6 | 17.4 |
| 512 | 10 | 5 | 2837.1 | 2867.2 | 2867.5 | 2867.7 | 17.4 |
| 512 | 20 | 5 | 5674.2 | 5734.9 | 5735.3 | 5735.6 | 17.2 |
| 512 | 40 | 5 | 11343.2 | 11467.9 | 11468.5 | 11468.9 | 17.4 |
| 输入 tokens | 批大小 | 并发 | 平均延迟 | P50 延迟 | P90 延迟 | P95 延迟 | 吞吐量(输入/秒) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 512 | 10 | 1 | 705.6 | 676.9 | 916.8 | 921.1 | 14.2 |
| 512 | 20 | 1 | 1414.1 | 1363.6 | 1589.5 | 1605.7 | 14.1 |
| 512 | 40 | 1 | 2758.9 | 2719.8 | 2937.8 | 2944.9 | 14.5 |
| 512 | 10 | 3 | 2004.2 | 2018.6 | 2021.6 | 2022.5 | 14.9 |
| 512 | 20 | 3 | 4012.6 | 4040.1 | 4045.6 | 4048.0 | 14.7 |
| 512 | 40 | 3 | 8019.4 | 8075.2 | 8082.2 | 8087.4 | 14.9 |
| 512 | 10 | 5 | 3321.0 | 3365.0 | 3370.8 | 3372.7 | 14.9 |
| 512 | 20 | 5 | 6645.1 | 6734.9 | 6741.6 | 6742.8 | 14.9 |
| 512 | 40 | 5 | 13284.7 | 13461.9 | 13473.1 | 13475.7 | 14.9 |