(window.webpackJsonp=window.webpackJsonp||[]).push([[593],{1805:function(s,t,a){"use strict";a.r(t);var n=a(7),e=Object(n.a)({},(function(){var s=this,t=s._self._c;return t("ContentSlotsDistributor",{attrs:{"slot-key":s.$parent.slotKey}},[t("h1",{attrs:{id:"统一资源调度平台"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#统一资源调度平台"}},[s._v("#")]),s._v(" 统一资源调度平台")]),s._v(" "),t("h2",{attrs:{id:"_01-背景"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_01-背景"}},[s._v("#")]),s._v(" 01.背景")]),s._v(" "),t("h3",{attrs:{id:"_1、概述"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1、概述"}},[s._v("#")]),s._v(" 1、概述")]),s._v(" "),t("ul",[t("li",[s._v("平台希望能将整个 AA 的机器资源统一管理，实现统一调度")]),s._v(" "),t("li",[s._v("将内部硬件资源动态分配，通过Quota等方式进行资源限制")]),s._v(" "),t("li",[s._v("让闲置的资源流动起来，同时要满足不同部门，不同团队的具体使用场景")])]),s._v(" "),t("h3",{attrs:{id:"_2、现状"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2、现状"}},[s._v("#")]),s._v(" 2、现状")]),s._v(" "),t("ul",[t("li",[s._v("目前评测、数据处理等任务的现有服务只是在k8s上直接部署，没有一个系统进行统一调度资源管理")]),s._v(" "),t("li",[s._v("而且有些机器占用后只是特定时间使用，造成资源浪费")]),s._v(" "),t("li",[s._v("同时新加一个服务需要运维审批，然后再自行k8s部署，调度过程并不可追踪且繁琐")])]),s._v(" "),t("h3",{attrs:{id:"_3、目标和非目标"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_3、目标和非目标"}},[s._v("#")]),s._v(" 3、目标和非目标")]),s._v(" "),t("p",[s._v("目标")]),s._v(" "),t("ul",[t("li",[s._v("实现对评测、数据处理相关任务所需机器资源在集群上的统一调度，简化当前调度机器资源的流程，减少人力成本，优化资源使用率")]),s._v(" "),t("li",[s._v("P1: 对平台进行优化，拓宽各种任务所需的功能，如支持服务的实例数量的在线扩/缩容等")]),s._v(" "),t("li",[s._v("P2: 将 NTS 训练平台的功能以及用户迁移到该平台")]),s._v(" "),t("li",[s._v("P3:  对整个AA的机器资源进行统一调度管理，支持更多业务的任务机器资源调度")])]),s._v(" "),t("p",[s._v("非目标")]),s._v(" "),t("ul",[t("li",[s._v("不做调度完机器资源之后具体业务上的调度\n"),t("ul",[t("li",[s._v("平台单次资源调度在 10s 级别，即资源充足情况下，从获取到服务所需资源到服务启动消耗的时间")])])]),s._v(" "),t("li",[s._v("不做具体业务的上线、维护")]),s._v(" "),t("li",[s._v("不提供有状态服务或任务的调度，如 mysql 数据库等")])]),s._v(" "),t("h2",{attrs:{id:"_02-任务类型"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_02-任务类型"}},[s._v("#")]),s._v(" 02.任务类型")]),s._v(" "),t("p",[s._v("平台可调度的任务主要分为 MPI Job / Spark Job 和 Service")]),s._v(" "),t("h3",{attrs:{id:"_1、mpi-job"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1、mpi-job"}},[s._v("#")]),s._v(" 1、MPI job")]),s._v(" "),t("ul",[t("li",[t("strong",[s._v("定义")]),s._v("：即须运行"),t("code",[s._v("一段时间只执行一次的 MPI 任务，任务有完整的生命周期")]),s._v("（如训练任务(MPI)、Datafilter）")]),s._v(" "),t("li",[t("strong",[s._v("平台职责")]),s._v("：\n"),t("ul",[t("li",[s._v("调度平台负责"),t("code",[s._v("将 mpi job 调度到合适的节点上执行")]),s._v("，并对 job 的生命周期进行监控，job 执行结束后立即释放资源")]),s._v(" "),t("li",[s._v("同时平台支持调度多个节点用于多机运算")])])]),s._v(" "),t("li",[t("strong",[s._v("存储方案")]),s._v("：\n"),t("ul",[t("li",[s._v("为用户提供一种可持久化的网络存储，支持用户申请后，在 job 启动时挂载到 job 上")]),s._v(" "),t("li",[s._v("同时也为每个 job 提供一个生命周期与 job 相同的临时存储")])])]),s._v(" "),t("li",[t("strong",[s._v("使用形式")]),s._v("：\n"),t("ul",[t("li",[s._v("用户通过将环境打成 image，在指定 image 下运行指定 command")])])]),s._v(" "),t("li",[t("strong",[s._v("调度策略")]),s._v("：\n"),t("ul",[t("li",[s._v("根据任务的资源需求以及集群资源空余情况，调度 job 到合适的机器")]),s._v(" "),t("li",[s._v("对于多节点任务，会在确保多个节点资源都满足的情况下一起调度")])])])]),s._v(" "),t("h3",{attrs:{id:"_2、spark-job"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2、spark-job"}},[s._v("#")]),s._v(" 2、Spark Job")]),s._v(" "),t("h4",{attrs:{id:"_1-spark-概述"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1-spark-概述"}},[s._v("#")]),s._v(" 1）Spark 概述")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("Spark 任务定义")]),s._v(" "),t("ul",[t("li",[s._v("Spark 是一个 "),t("strong",[s._v("分布式计算框架")]),s._v("，用于处理大规模数据")]),s._v(" "),t("li",[s._v("Spark Job "),t("code",[s._v("自动切分数据")]),s._v(" 并 "),t("code",[s._v("智能调度计算")])]),s._v(" "),t("li",[s._v("相比 K8s Job，无需手动拆分数据，还能 动态调整资源")])])]),s._v(" "),t("li",[t("p",[s._v("例")]),s._v(" "),t("ul",[t("li",[t("strong",[s._v("自动划分 100GB LiDAR 数据")]),s._v("，每个 Executor 只处理一部分数据")]),s._v(" "),t("li",[t("strong",[s._v("Driver 统一调度")]),s._v("，根据集群资源动态调整 Executor 数量（如 5 个 Executor）")]),s._v(" "),t("li",[t("strong",[s._v("计算完成后自动聚合结果")]),s._v("，无需手动管理数据拆分和合并")])])]),s._v(" "),t("li",[t("p",[s._v("Spark Job 的作用（主要用于 处理大规模数据集）")]),s._v(" "),t("ul",[t("li",[t("p",[t("strong",[s._v("批处理计算")]),s._v("：用于定期处理历史数据，如日志分析、数据统计等")])]),s._v(" "),t("li",[t("p",[t("strong",[s._v("流式计算")]),s._v("：处理实时数据流，如异常检测、事件分析等")])]),s._v(" "),t("li",[t("p",[t("strong",[s._v("数据转换")]),s._v("：从多个数据源提取数据，进行转换和清洗，再存入数据仓库或数据库")])])])]),s._v(" "),t("li",[t("p",[s._v("应用场景: 清洗 LiDAR 点云数据")]),s._v(" "),t("ul",[t("li",[t("p",[t("strong",[s._v("Spark 读取 100GB 点云数据")]),s._v("，自动分配多个计算节点并行处理")])]),s._v(" "),t("li",[t("p",[t("strong",[s._v("去除无效点")]),s._v("（如反射过弱的噪声点）")])]),s._v(" "),t("li",[t("p",[t("strong",[s._v("转换标准格式")]),s._v("，方便下游算法使用")])])])])]),s._v(" "),t("h4",{attrs:{id:"_2-普通k8s-job"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2-普通k8s-job"}},[s._v("#")]),s._v(" 2）普通K8s Job")]),s._v(" "),t("blockquote",[t("p",[s._v("如果用 "),t("code",[s._v("K8s Job 直接处理 100GB LiDAR 数据")]),s._v("，可能"),t("code",[s._v("需要手动拆分数据")])])]),s._v(" "),t("ul",[t("li",[t("strong",[s._v("手动切分数据")]),s._v("，让每个 Pod 只处理一部分")]),s._v(" "),t("li",[s._v("K8s 会启动 5 个 Pod，每个 Pod 负责不同的 "),t("code",[s._v("lidar_part_{index}.bin")]),s._v(" 数据")]),s._v(" "),t("li",[s._v("缺点：\n"),t("ul",[t("li",[t("code",[s._v("数据划分需手动管理")]),s._v("，"),t("code",[s._v("资源固定，无法动态扩展")])])])])]),s._v(" "),t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" batch/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Job\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" lidar"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("processing\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("completions")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("5")]),s._v("  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 5个并行任务")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("parallelism")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("5")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("template")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("containers")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" lidar"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("processor\n        "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("image")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" myrepo/lidar"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("v1\n        "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("args")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("[")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"--input"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"s3://dataset/lidar_part_{index}.bin"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("]")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br"),t("span",{staticClass:"line-number"},[s._v("13")]),t("br")])]),t("h4",{attrs:{id:"_3-spark-job"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_3-spark-job"}},[s._v("#")]),s._v(" 3）Spark Job")]),s._v(" "),t("blockquote",[t("p",[s._v("如果用 "),t("strong",[s._v("Spark Job 处理 100GB LiDAR 数据")]),s._v("，流程完全不同")])]),s._v(" "),t("ul",[t("li",[t("p",[t("strong",[s._v("K8s 启动 Spark Driver")]),s._v("，Driver 发现数据大，向 K8s 申请 "),t("strong",[s._v("5 个 Executor")])])]),s._v(" "),t("li",[t("p",[t("strong",[s._v("Executor 由 Spark 负责调度")]),s._v("，K8s 只是分配 Pod 资源，不关心具体计算任务")])]),s._v(" "),t("li",[t("p",[s._v("Spark 会"),t("strong",[s._v("自动分割数据")]),s._v("，并均匀分配给 5 个 Executor（无需手动指定每个 Executor 处理哪部分数据）")])]),s._v(" "),t("li",[t("p",[s._v("任务完成后，Driver 触发"),t("strong",[s._v("结果聚合")]),s._v("，并存储到 HDFS/S3")])]),s._v(" "),t("li",[t("p",[t("code",[s._v("① 构建 Image")])])])]),s._v(" "),t("div",{staticClass:"language-dockerfile line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-dockerfile"}},[t("code",[t("span",{pre:!0,attrs:{class:"token instruction"}},[t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("FROM")]),s._v(" apache/spark:latest")]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token instruction"}},[t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("RUN")]),s._v(" pip install opencv-python numpy pyspark")]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token instruction"}},[t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("COPY")]),s._v(" camera_processing.py /app/")]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token instruction"}},[t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("ENTRYPOINT")]),s._v(" ["),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"spark-submit"')]),s._v(", "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"--master"')]),s._v(", "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"k8s://kubernetes.api.server"')]),s._v(", "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"/app/camera_processing.py"')]),s._v("]")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br")])]),t("ul",[t("li",[t("code",[s._v("② 任务提交")]),s._v("（K8s YAML 示例）")])]),s._v(" "),t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" sparkoperator.k8s.io/v1beta2\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" SparkApplication\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" camera"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("processing\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("type")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Python\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("image")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" myrepo/spark"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("camera"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("v1\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("mainApplicationFile")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" /app/camera_processing.py\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("sparkConf")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spark.executor.instances")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"5"')]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spark.executor.memory")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"8g"')]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spark.executor.cores")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"4"')]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("driver")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("cores")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("2")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("memory")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"4g"')]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("executor")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("cores")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("4")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("memory")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"8g"')]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br"),t("span",{staticClass:"line-number"},[s._v("13")]),t("br"),t("span",{staticClass:"line-number"},[s._v("14")]),t("br"),t("span",{staticClass:"line-number"},[s._v("15")]),t("br"),t("span",{staticClass:"line-number"},[s._v("16")]),t("br"),t("span",{staticClass:"line-number"},[s._v("17")]),t("br"),t("span",{staticClass:"line-number"},[s._v("18")]),t("br")])]),t("ul",[t("li",[t("p",[t("code",[s._v("③ 任务执行流程")])]),s._v(" "),t("ul",[t("li",[t("p",[s._v("K8s 调度 "),t("strong",[s._v("Spark Driver Pod")]),s._v("，运行 "),t("code",[s._v("camera_processing.py")])])]),s._v(" "),t("li",[t("p",[s._v("Driver 启动 "),t("strong",[s._v("5 个 Executor")]),s._v("，每个 Executor 处理不同的视频段")])]),s._v(" "),t("li",[t("p",[s._v("任务结束后，自动清理所有 Pod，释放资源")])])])])]),s._v(" "),t("h4",{attrs:{id:"_4-平台调度职责"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_4-平台调度职责"}},[s._v("#")]),s._v(" 4）平台调度职责")]),s._v(" "),t("ul",[t("li",[t("p",[t("strong",[s._v("基于资源需求选择最佳节点")])]),s._v(" "),t("ul",[t("li",[t("p",[s._v("任务提交时，调度器检查 "),t("strong",[s._v("CPU/内存/GPU 需求")])])]),s._v(" "),t("li",[t("p",[s._v("选择最合适的 Worker 节点运行 "),t("strong",[s._v("Spark Driver")])])]),s._v(" "),t("li",[t("p",[s._v("Executor 的调度由 Spark 自身 "),t("strong",[s._v("动态分配")]),s._v("（Spark on K8s 会自动请求 K8s 分配 Executor Pod）")])])])]),s._v(" "),t("li",[t("p",[t("strong",[s._v("动态调整 Executor 数量")])]),s._v(" "),t("ul",[t("li",[t("p",[s._v("任务启动时，默认申请 5 个 Executor")])]),s._v(" "),t("li",[t("p",[s._v("发现数据量大，Spark 自动向 K8s 申请更多 Executor（最多扩展到 20 个）")])]),s._v(" "),t("li",[t("p",[s._v("任务快结束时，Spark 释放多余 Executor，只保留核心计算资源")])]),s._v(" "),t("li",[t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("sparkConf")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spark.dynamicAllocation.enabled")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"true"')]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spark.dynamicAllocation.initialExecutors")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"5"')]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spark.dynamicAllocation.minExecutors")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"2"')]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spark.dynamicAllocation.maxExecutors")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"20"')]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br")])])])])]),s._v(" "),t("li",[t("p",[s._v("调度服务指定调度资源")]),s._v(" "),t("ul",[t("li",[s._v("K8s 集群、Namespace")]),s._v(" "),t("li",[t("code",[s._v("① nodeSelector")]),s._v(" "),t("ul",[t("li",[s._v("在 YAML 里添加 "),t("code",[s._v("nodeSelector")]),s._v("，让 K8s 选择合适的计算节点")]),s._v(" "),t("li",[s._v("适用场景：需要 GPU 加速时，可以限定 Job 只跑在带 GPU 的机器上")])])]),s._v(" "),t("li",[t("code",[s._v("② 打污点")]),s._v(" "),t("ul",[t("li",[s._v("让 Spark Job 运行在 "),t("code",[s._v("带 Taint 限制的特殊节点")]),s._v("（如高性能计算节点）")])])]),s._v(" "),t("li",[t("code",[s._v("③ 使用 Affinity 控制调度")]),s._v(" "),t("ul",[t("li",[s._v("让 Spark Job 只运行在 "),t("strong",[s._v("特定云厂商的节点")]),s._v("（如 AWS/GCP）")])])])])])]),s._v(" "),t("h4",{attrs:{id:"_5-spark-job-原理"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_5-spark-job-原理"}},[s._v("#")]),s._v(" 5）Spark Job 原理")]),s._v(" "),t("ul",[t("li",[t("p",[t("code",[s._v("① 数据分区")])]),s._v(" "),t("ul",[t("li",[s._v("任务开始时，Driver 读取数据源（如 S3/HDFS）")]),s._v(" "),t("li",[s._v("数据被切分为多个 Partition，每个 Partition 由一个 Executor 负责处理")]),s._v(" "),t("li",[s._v("如果 100GB LiDAR 数据存储在 S3（默认 block 128MB）\n"),t("ul",[t("li",[s._v("则 Spark 可能会创建 100GB / 128MB ≈ 800 个 Partition")]),s._v(" "),t("li",[s._v("这些 Partition 会均匀分配给多个 Executor")])])])])]),s._v(" "),t("li",[t("p",[t("code",[s._v("② 数据如何分布式处理")])]),s._v(" "),t("ul",[t("li",[t("p",[t("code",[s._v("spark.read.parquet")]),s._v(" 自动将数据切分为 Partition 并分发给 Executor")])]),s._v(" "),t("li",[t("p",[t("code",[s._v("df.filter()")]),s._v(" 和 "),t("code",[s._v("df.select()")]),s._v(" 在 Executor 内部 并行执行")])]),s._v(" "),t("li",[t("p",[t("code",[s._v("df.write.parquet")]),s._v(" 触发 Shuffle 阶段（数据聚合）")])]),s._v(" "),t("li",[t("div",{staticClass:"language-python line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-python"}},[t("code",[t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("from")]),s._v(" pyspark"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("sql "),t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("import")]),s._v(" SparkSession\n\nspark "),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v(" SparkSession"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("builder"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("appName"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"LiDARProcessing"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("getOrCreate"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\n\n"),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# Step 1: 读取数据（Spark 自动切分 Partition）")]),s._v("\ndf "),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v(" spark"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("read"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("parquet"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"s3://dataset/lidar_data.parquet"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\n\n"),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# Step 2: 数据处理（每个 Partition 并行处理）")]),s._v("\ncleaned_df "),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v(" df"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),t("span",{pre:!0,attrs:{class:"token builtin"}},[s._v("filter")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),s._v("df"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("[")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"intensity"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("]")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v(">")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("0.1")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("select"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"x"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"y"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"z"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\n\n"),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# Step 3: 保存处理后的数据（Spark 自动聚合）")]),s._v("\ncleaned_df"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("write"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("parquet"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"s3://output/processed_lidar.parquet"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br")])])])])]),s._v(" "),t("li",[t("p",[t("code",[s._v("③ 计算完成后，如何聚合数据")])]),s._v(" "),t("ul",[t("li",[t("p",[t("code",[s._v("窄依赖")])]),s._v(" "),t("ul",[t("li",[s._v("如果 Task 之间没有数据交换，数据可以在本地 Executor 内部完成计算（无需网络传输）")]),s._v(" "),t("li",[s._v("例如 "),t("code",[s._v("filter()")]),s._v(" 仅处理当前 Partition 的数据，不需要跨 Executor 交互")])])]),s._v(" "),t("li",[t("p",[t("code",[s._v("宽依赖")])]),s._v(" "),t("ul",[t("li",[t("p",[s._v("例如 "),t("code",[s._v("groupBy()")]),s._v("、"),t("code",[s._v("reduceByKey()")]),s._v(" 需要跨 Partition 进行数据合并")])]),s._v(" "),t("li",[t("p",[s._v("Spark 需要 重新分区（Shuffle），将相同 key 的数据汇总到同一个 Executor 进行最终计算")])])])])])])]),s._v(" "),t("h3",{attrs:{id:"_3、service"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_3、service"}},[s._v("#")]),s._v(" 3、Service")]),s._v(" "),t("ul",[t("li",[t("strong",[s._v("定义")]),s._v("：即需要长期保持运行，执行日常任务的无状态的服务")]),s._v(" "),t("li",[t("strong",[s._v("平台职责")]),s._v("：\n"),t("ul",[t("li",[s._v("平台负责维持用户定义数量的服务实例，实例挂掉后自动重启，保证服务的稳定运行")]),s._v(" "),t("li",[s._v("用户能够动态扩缩服务实例的数量")])])]),s._v(" "),t("li",[t("strong",[s._v("例子")]),s._v("：目前训练系统所使用的 WebTerminal、Web 后端等")]),s._v(" "),t("li",[t("strong",[s._v("存储方案")]),s._v("：持久化存储形式同 mpi job")]),s._v(" "),t("li",[t("strong",[s._v("调度策略")]),s._v("：只要能满足运行服务的最小需求，即"),t("code",[s._v("minAvailable")]),s._v("即可调度")]),s._v(" "),t("li",[s._v("相关组件：用户可以创建和管理一些相关组件，然后在启动 service 时选择对应的组件\n"),t("ul",[t("li",[s._v("config：service 所需的配置文件")]),s._v(" "),t("li",[s._v("secret：包含少量敏感信息例如密码、令牌或密钥的对象")])])])]),s._v(" "),t("h3",{attrs:{id:"_4、gpu任务调度差异"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_4、gpu任务调度差异"}},[s._v("#")]),s._v(" 4、GPU任务调度差异")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("从调度系统的角度来看，"),t("strong",[s._v("CPU 任务和 GPU 任务的调度流程基本相同")])])]),s._v(" "),t("li",[t("p",[s._v("都是通过 YAML 指定 "),t("strong",[s._v("集群（Cluster）、命名空间（Namespace）、资源需求（CPU/GPU/内存）、污点/亲和性等约束条件")])])]),s._v(" "),t("li",[t("p",[s._v("但是，"),t("strong",[s._v("GPU 任务调度相比 CPU 任务，需要额外考虑一些因素")]),s._v("，以确保任务能够正确、高效地运行")])])]),s._v(" "),t("h4",{attrs:{id:"_1-资源隔离"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1-资源隔离"}},[s._v("#")]),s._v(" 1）资源隔离")]),s._v(" "),t("ul",[t("li",[t("p",[t("strong",[s._v("CPU 任务")]),s._v(" 只需要 "),t("strong",[s._v("CPU/内存")]),s._v("，可以随意调度到任何有空闲 CPU 的节点上")])]),s._v(" "),t("li",[t("p",[t("strong",[s._v("GPU 任务")]),s._v(" 需要 "),t("strong",[s._v("GPU 资源（如 NVIDIA A100, V100）")])])]),s._v(" "),t("li",[t("p",[s._v("并且 "),t("strong",[s._v("必须确保 GPU 资源不会被多个 Pod 共享")]),s._v("（即 GPU 不能像 CPU 那样在多个任务间动态分配）")])]),s._v(" "),t("li",[t("p",[s._v("① 使用 "),t("code",[s._v("nvidia.com/gpu")]),s._v(" 资源请求，让 K8s 确保每个 GPU Pod 绑定到特定 GPU")]),s._v(" "),t("ul",[t("li",[t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("resources")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("limits")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("nvidia.com/gpu")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("1")]),s._v("  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 申请 1 张 GPU")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br")])])])])]),s._v(" "),t("li",[t("p",[s._v("② 启用 "),t("code",[s._v("device-plugin")]),s._v("，让 K8s 正确分配 GPU")]),s._v(" "),t("ul",[t("li",[t("div",{staticClass:"language- line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-text"}},[t("code",[s._v("kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br")])])])])]),s._v(" "),t("li",[t("p",[s._v("③ 使用 "),t("code",[s._v("nodeSelector")]),s._v(" 或 "),t("code",[s._v("taints")]),s._v(" 确保 GPU 任务不会被调度到 没有 GPU 的普通节点")])])]),s._v(" "),t("h4",{attrs:{id:"_2-亲和性与反亲和性"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2-亲和性与反亲和性"}},[s._v("#")]),s._v(" 2）亲和性与反亲和性")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("某些任务一定要跑在特定类型的节点上（亲和性）")])]),s._v(" "),t("li",[t("p",[s._v("某些任务不能和其他任务跑在一起（反亲和性）")])]),s._v(" "),t("li",[t("p",[s._v("① 任务必须运行在某些节点上（如 GPU 任务只能跑在 GPU 服务器上）")])]),s._v(" "),t("li",[t("p",[s._v("② 任务避免调度到相同的物理机上（提高可靠性）")])])]),s._v(" "),t("h4",{attrs:{id:"_3-数据本地性"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_3-数据本地性"}},[s._v("#")]),s._v(" 3）数据本地性")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("对于大规模数据处理（如 Spark 任务），数据存储在 HDFS、S3、CephFS 这类分布式存储，任务需要从远程存储读取数据")])]),s._v(" "),t("li",[t("p",[s._v("如果调度不当，可能会导致：")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("任务调度到数据存储位置很远的节点，导致 I/O 开销大")])]),s._v(" "),t("li",[t("p",[s._v("同一个任务的 Executor 可能被分布到不同的数据中心，影响性能")])])])]),s._v(" "),t("li",[t("p",[s._v("① 使用 "),t("code",[s._v("nodeAffinity")]),s._v(" 让任务优先调度到存储节点附近")])]),s._v(" "),t("li",[t("p",[s._v("② 使用 Spark "),t("code",[s._v("spark.locality.wait")]),s._v(" 参数，减少远程数据访问开销")]),s._v(" "),t("ul",[t("li",[t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[s._v("spark.locality.wait=10ms  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 让 Spark 尽量等一小段时间，看是否能调度到数据本地节点")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br")])])])])]),s._v(" "),t("li",[t("p",[s._v("③ 如果数据在 S3 等远程存储，可以提前缓存数据到本地存储（如 NVMe 盘）")])])]),s._v(" "),t("h4",{attrs:{id:"_4-任务隔离"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_4-任务隔离"}},[s._v("#")]),s._v(" 4）任务隔离")]),s._v(" "),t("p",[s._v("在 K8s 里，不同团队可能会共享 GPU 资源，但我们需要")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("确保 GPU 任务不会影响 CPU 任务")])]),s._v(" "),t("li",[t("p",[s._v("同一个团队的 GPU 任务可以共享资源，但不同团队的任务不能抢占资源")])]),s._v(" "),t("li",[t("p",[t("code",[s._v("① 使用 K8s ResourceQuota 限制每个团队的 GPU 资源")])]),s._v(" "),t("ul",[t("li",[t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" ResourceQuota\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" gpu"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("quota\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("namespace")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" team"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("a\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("hard")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("nvidia.com/gpu")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"4"')]),s._v("  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 该团队最多使用 4 块 GPU")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br")])])])])]),s._v(" "),t("li",[t("p",[s._v("② 使用 "),t("code",[s._v("PriorityClass")]),s._v(" 让重要任务优先调度")]),s._v(" "),t("ul",[t("li",[t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" scheduling.k8s.io/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" PriorityClass\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" high"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("priority\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("value")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("1000")]),s._v("  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 优先级越高，调度时越靠前")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br")])])])])])]),s._v(" "),t("h2",{attrs:{id:"_03-存储方案"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_03-存储方案"}},[s._v("#")]),s._v(" 03.存储方案")]),s._v(" "),t("h3",{attrs:{id:"_1、tb数据-长时间任务"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1、tb数据-长时间任务"}},[s._v("#")]),s._v(" 1、TB数据 长时间任务")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("对 Ceph RBD CSI 进行集成，用户可以申请一些“硬盘”，在申请资源时可以选择挂载，存储内容持久化")])]),s._v(" "),t("li",[t("p",[s._v("大规模深度学习训练（如自动驾驶模型训练）")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("训练任务往往 "),t("code",[s._v("持续数小时到数天")]),s._v("，需要 "),t("code",[s._v("大数据集（TB级别） 作为输入")])])]),s._v(" "),t("li",[t("p",[s._v("训练过程中会"),t("code",[s._v("不断生成 模型 checkpoint（断点续训）、日志、训练中间结果")])])]),s._v(" "),t("li",[t("p",[s._v("需要 高吞吐、低延迟存储，并且存储内容 "),t("code",[s._v("训练结束后仍然需要保留")]),s._v("（如模型参数）")])])])]),s._v(" "),t("li",[t("p",[s._v("解决方案")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("使用 "),t("code",[s._v("Ceph RBD CSI")]),s._v(" 进行持久化存储，挂载到训练任务的 Pod")])]),s._v(" "),t("li",[t("p",[s._v("在任务调度时，自动检查是否有 Ceph RBD 挂载请求，并绑定到合适的节点")])])])]),s._v(" "),t("li",[t("p",[s._v("具体 YAML 配置")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("任务提交时，调度系统检查是否需要挂载 Ceph RBD 持久化存储")])]),s._v(" "),t("li",[t("p",[s._v("如果任务需要 GPU 计算，调度系统会选择 GPU 计算节点，并确保 Ceph RBD 可以挂载到该节点")])]),s._v(" "),t("li",[t("p",[s._v("如果 Ceph  RBD 已被其他任务占用，调度系统会等待，或者调度到其他空闲节点")])]),s._v(" "),t("li",[t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" PersistentVolumeClaim\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" model"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("training"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("storage\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("accessModes")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" ReadWriteOnce\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("resources")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("requests")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("storage")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" 500Gi  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 申请 500GB 存储")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("storageClassName")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" ceph"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("rbd  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 指定 Ceph RBD 存储")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br")])])])])])]),s._v(" "),t("h3",{attrs:{id:"_2、短时任务-临时存储"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2、短时任务-临时存储"}},[s._v("#")]),s._v(" 2、短时任务 临时存储")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("需求")]),s._v(" "),t("ul",[t("li",[t("p",[t("code",[s._v("数据清洗、数据增强、格式转换")]),s._v(" 任务只运行"),t("code",[s._v("几分钟到几小时")])])]),s._v(" "),t("li",[t("p",[s._v("需要 "),t("code",[s._v("高性能磁盘")]),s._v("，但 "),t("code",[s._v("数据可以临时存储，任务完成后可释放")])])]),s._v(" "),t("li",[t("p",[s._v("计算过程中可能会产生 "),t("code",[s._v("临时缓存（如解压文件、转换后的格式）")]),s._v("，不需要持久化存储")])])])]),s._v(" "),t("li",[t("p",[s._v("解决方案")]),s._v(" "),t("ul",[t("li",[s._v("使用 "),t("code",[s._v("Local Path CSI")]),s._v(" 作为临时存储，每个物理机上的 NVMe 或 SSD 盘提供存储")]),s._v(" "),t("li",[s._v("如果本地存储资源不足，调度系统需要考虑负载均衡，避免某些节点磁盘使用率过高")])])]),s._v(" "),t("li",[t("p",[s._v("具体 YAML 配置")]),s._v(" "),t("ul",[t("li",[t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" PersistentVolumeClaim\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" data"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("processing"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("storage\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("accessModes")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" ReadWriteOnce\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("resources")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("requests")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("storage")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" 100Gi  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 申请 100GB 临时存储")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("storageClassName")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" local"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("path  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 使用 Local Path CSI")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br")])])])])])]),s._v(" "),t("h3",{attrs:{id:"_3、-redis-fuse-挂载"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_3、-redis-fuse-挂载"}},[s._v("#")]),s._v(" 3、 Redis-FUSE 挂载")]),s._v(" "),t("table",[t("thead",[t("tr",[t("th",[t("strong",[s._v("方案")])]),s._v(" "),t("th",[t("strong",[s._v("特点")])]),s._v(" "),t("th",[t("strong",[s._v("适用场景")])])])]),s._v(" "),t("tbody",[t("tr",[t("td",[t("strong",[s._v("业务代码主动访问 Redis")])]),s._v(" "),t("td",[s._v("需要使用 "),t("code",[s._v("redis-py")]),s._v(" SDK")]),s._v(" "),t("td",[s._v("适用于 "),t("strong",[s._v("存储结构化数据")]),s._v("（string、hash、list）")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("Redis-FUSE 挂载")])]),s._v(" "),t("td",[s._v("直接 "),t("code",[s._v('open("/mnt/cache/file")')]),s._v(" 访问")]),s._v(" "),t("td",[s._v("适用于 "),t("strong",[s._v("存储大文件")]),s._v("（LiDAR 点云、视频帧）")])])])]),s._v(" "),t("ul",[t("li",[s._v("业务代码希望直接 "),t("code",[s._v('open("/mnt/cache/data.bin")')]),s._v(" 访问 Redis，而不用 SDK")]),s._v(" "),t("li",[s._v("Redis 主要存储 "),t("code",[s._v("二进制文件或大数据块")]),s._v("，如 "),t("code",[s._v("LiDAR 点云、图片、视频帧")])]),s._v(" "),t("li",[s._v("需要 "),t("code",[s._v("高吞吐、低延迟访问")])])]),s._v(" "),t("h4",{attrs:{id:"_1-k8s-任务-yaml-配置"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1-k8s-任务-yaml-配置"}},[s._v("#")]),s._v(" 1）K8s 任务 YAML 配置")]),s._v(" "),t("ul",[t("li",[t("p",[t("code",[s._v("Pod 启动时，自动挂载 Redis-FUSE")]),s._v("，"),t("code",[s._v("/mnt/cache")]),s._v(" 变成 Redis 的“本地目录”")]),s._v(" "),t("p",[s._v("业务代码直接 "),t("code",[s._v("open()")]),s._v(" 访问 Redis 数据，无需修改")])])]),s._v(" "),t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" batch/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Job\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" redis"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("fuse"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("job\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("template")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("containers")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n        "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" inference\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("image")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" inference"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("image\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("command")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("[")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"/bin/bash"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"-c"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("]")]),s._v("\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("args")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"redisfs mount redis://redis-server:6379 /mnt/cache && python inference.py"')]),s._v("\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("volumeMounts")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("mountPath")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" /mnt/cache\n              "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" inference"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("cache\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("volumes")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n        "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" inference"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("cache\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("emptyDir")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("{")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("}")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("restartPolicy")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Never\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br"),t("span",{staticClass:"line-number"},[s._v("13")]),t("br"),t("span",{staticClass:"line-number"},[s._v("14")]),t("br"),t("span",{staticClass:"line-number"},[s._v("15")]),t("br"),t("span",{staticClass:"line-number"},[s._v("16")]),t("br"),t("span",{staticClass:"line-number"},[s._v("17")]),t("br"),t("span",{staticClass:"line-number"},[s._v("18")]),t("br"),t("span",{staticClass:"line-number"},[s._v("19")]),t("br"),t("span",{staticClass:"line-number"},[s._v("20")]),t("br")])]),t("h4",{attrs:{id:"_2-业务脚本-inference-py"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2-业务脚本-inference-py"}},[s._v("#")]),s._v(" 2）业务脚本 "),t("code",[s._v("inference.py")])]),s._v(" "),t("div",{staticClass:"language-python line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-python"}},[t("code",[t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("with")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token builtin"}},[s._v("open")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"/mnt/cache/lidar_frame_001.bin"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"rb"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("as")]),s._v(" f"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    data "),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v(" f"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("read"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\n\n"),t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("print")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token string-interpolation"}},[t("span",{pre:!0,attrs:{class:"token string"}},[s._v('f"Loaded ')]),t("span",{pre:!0,attrs:{class:"token interpolation"}},[t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("{")]),t("span",{pre:!0,attrs:{class:"token builtin"}},[s._v("len")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),s._v("data"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("}")])]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v(' bytes from Redis-backed cache"')])]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br")])]),t("h3",{attrs:{id:"_4、脚本主动访问-redis"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_4、脚本主动访问-redis"}},[s._v("#")]),s._v(" 4、脚本主动访问 Redis")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("需求（如自动驾驶感知计算）")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("低延迟，高吞吐")])]),s._v(" "),t("li",[t("p",[s._v("需要"),t("code",[s._v("缓存最近一批推理数据")]),s._v("，"),t("code",[s._v("避免重复读取远程存储")]),s._v("（如 S3、HDFS）")])]),s._v(" "),t("li",[t("p",[s._v("需要 "),t("code",[s._v("存储系统提供 Python SDK 访问能力")]),s._v("（如 TensorFlow 训练数据加载）")])])])])]),s._v(" "),t("h4",{attrs:{id:"_1-k8s-任务-yaml-配置-2"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1-k8s-任务-yaml-配置-2"}},[s._v("#")]),s._v(" 1）K8s 任务 YAML 配置")]),s._v(" "),t("ul",[t("li",[t("strong",[s._v("环境变量 "),t("code",[s._v("REDIS_HOST")]),s._v(" & "),t("code",[s._v("REDIS_PORT")])]),s._v(" → 脚本可以动态连接 Redis")]),s._v(" "),t("li",[t("strong",[s._v("Pod 运行 "),t("code",[s._v("inference.py")]),s._v(" 业务脚本")]),s._v("，从 Redis 拉取数据并处理")])]),s._v(" "),t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" batch/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Job\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" redis"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("inference"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("job\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("template")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("containers")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n        "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" inference\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("image")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" inference"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("image\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("command")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("[")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"/bin/bash"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"-c"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("]")]),s._v("\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("args")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"python inference.py"')]),s._v("\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("env")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" REDIS_HOST\n              "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("value")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"redis-server"')]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" REDIS_PORT\n              "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("value")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"6379"')]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("restartPolicy")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Never\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br"),t("span",{staticClass:"line-number"},[s._v("13")]),t("br"),t("span",{staticClass:"line-number"},[s._v("14")]),t("br"),t("span",{staticClass:"line-number"},[s._v("15")]),t("br"),t("span",{staticClass:"line-number"},[s._v("16")]),t("br"),t("span",{staticClass:"line-number"},[s._v("17")]),t("br"),t("span",{staticClass:"line-number"},[s._v("18")]),t("br"),t("span",{staticClass:"line-number"},[s._v("19")]),t("br")])]),t("h4",{attrs:{id:"_2-业务脚本-inference-py-2"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2-业务脚本-inference-py-2"}},[s._v("#")]),s._v(" 2）业务脚本 "),t("code",[s._v("inference.py")])]),s._v(" "),t("ul",[t("li",[s._v("自动判断 Redis 数据类型（"),t("code",[s._v("r.type(key)")]),s._v("）")]),s._v(" "),t("li",[s._v("针对不同类型（string、hash、list、set、zset）调用不同的 API")]),s._v(" "),t("li",[s._v("避免硬编码 key 的类型，提高灵活性")])]),s._v(" "),t("div",{staticClass:"language-py line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-py"}},[t("code",[t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("import")]),s._v(" os\n"),t("span",{pre:!0,attrs:{class:"token keyword"}},[s._v("import")]),s._v(" redis\n\n"),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 连接 Redis")]),s._v("\nredis_host "),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v(" os"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("getenv"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"REDIS_HOST"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"localhost"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\nredis_port "),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token builtin"}},[s._v("int")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),s._v("os"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("getenv"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"REDIS_PORT"')]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("6379")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\nr "),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v(" redis"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(".")]),s._v("Redis"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("(")]),s._v("host"),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v("redis_host"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" port"),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),s._v("redis_port"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(",")]),s._v(" decode_responses"),t("span",{pre:!0,attrs:{class:"token operator"}},[s._v("=")]),t("span",{pre:!0,attrs:{class:"token boolean"}},[s._v("True")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(")")]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br")])]),t("h4",{attrs:{id:"_3-调度系优化点"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_3-调度系优化点"}},[s._v("#")]),s._v(" 3）调度系优化点")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("假设我们有 "),t("code",[s._v("3 个 K8s 集群")]),s._v("，每个集群都可能运行 Redis 缓存，同时任务需要访问激光雷达数据 ("),t("code",[s._v("frame_001.bin")]),s._v(")")])]),s._v(" "),t("li",[t("p",[s._v("🔹 任务提交前，调度系统的决策流程")])]),s._v(" "),t("li",[t("p",[t("code",[s._v("① 任务提交")]),s._v("：请求处理 frame_001")]),s._v(" "),t("ul",[t("li",[s._v("任务申请 2 个 GPU，10GB 内存")])])]),s._v(" "),t("li",[t("p",[t("code",[s._v("② 调度系统检查缓存情况")])]),s._v(" "),t("ul",[t("li",[t("p",[s._v("发现 "),t("code",[s._v("frame_001")]),s._v(" 已经缓存在 A 集群的 Redis 上")])]),s._v(" "),t("li",[t("p",[s._v("B 和 C 没有缓存该数据，但 C 空闲 GPU 资源最多")])])])]),s._v(" "),t("li",[t("p",[t("code",[s._v("③ 智能调度决策")])]),s._v(" "),t("ul",[t("li",[t("p",[s._v("如果 A 集群有空闲 GPU，任务优先调度到 A 集群，直接从 Redis 读取数据（最快）")])]),s._v(" "),t("li",[t("p",[s._v("如果 A 集群 GPU 资源不足")]),s._v(" "),t("ul",[t("li",[s._v("检查 A 集群是否有本地 emptyDir 缓存（如 "),t("code",[s._v("/mnt/cache")]),s._v("）")]),s._v(" "),t("li",[s._v("如果 A 也没有缓存，才调度到 C，并触发缓存预热机制（将 "),t("code",[s._v("frame_001")]),s._v(" 拉取到 C 的 Redis）")])])])])])]),s._v(" "),t("h2",{attrs:{id:"_04-技术架构"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_04-技术架构"}},[s._v("#")]),s._v(" 04.技术架构")]),s._v(" "),t("h3",{attrs:{id:"_1、资源管理"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1、资源管理"}},[s._v("#")]),s._v(" 1、资源管理")]),s._v(" "),t("ul",[t("li",[s._v("资源划分：将集群资源划分为 Public 资源池和 Project 资源池\n"),t("ul",[t("li",[s._v("Public 资源为整个部门共同使用的资源，大部分资源都在 public 资源中，比如提供给算法训练的机器资源")]),s._v(" "),t("li",[s._v("Project 资源为某个项目或业务的保留资源，也可以是组内特殊用途的机器")])])]),s._v(" "),t("li",[s._v("Public 资源 Quota 限制：\n"),t("ul",[t("li",[s._v("用户可以选择以个人名义或 project 名义申请 public 资源，个人和 project 都有一定限额的 quota 限制")]),s._v(" "),t("li",[s._v("分发到个人和 project 的总 quota 会大于 public 资源数量，来提高集群利用率")]),s._v(" "),t("li",[s._v("任务在 running 或者 pending 都会占用相应 quota")]),s._v(" "),t("li",[s._v("Quota 将以CPU，内存，不同加速硬件来划分")])])]),s._v(" "),t("li",[s._v("Project 资源：\n"),t("ul",[t("li",[s._v("用户在 project 内调度任务不受 quota 限制，只会受 project 资源数量限制")]),s._v(" "),t("li",[s._v("用户在 project 内的任务全 project 共享，可以共同对 project 内任务进行操作")])])]),s._v(" "),t("li",[s._v("任务优先级：用户调度任务时可以选择 normal 和 low 两种优先级\n"),t("ul",[t("li",[s._v("low 任务可以在整个集群上调度，不受 quota 的限制")]),s._v(" "),t("li",[s._v("normal 任务可以在 project 或 public 资源中调度")]),s._v(" "),t("li",[s._v("Normal 任务始终会抢占 low 任务")])])])]),s._v(" "),t("h4",{attrs:{id:"_1-资源划分"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1-资源划分"}},[s._v("#")]),s._v(" 1）资源划分")]),s._v(" "),t("p",[s._v("平台所调度的机器资源主要分为含加速硬件节点和 cpu 节点两类")]),s._v(" "),t("ul",[t("li",[s._v("加速硬件节点：指带 GPU 节点或端侧开发板等带加速硬件的节点\n"),t("ul",[t("li",[s._v("申请任务时需选择需要申请的加速硬件类型")]),s._v(" "),t("li",[s._v("这类节点在申请时，节点的 CPU，内存数会和加速硬件个数绑定")]),s._v(" "),t("li",[s._v("例如：A100节点可调度的资源有 8 块 gpu，56 核 cpu，944 GB 内存于是每 1 块卡就和 7 核 cpu、118 GB 内存绑定")])])]),s._v(" "),t("li",[s._v("CPU节点： 指只带 CPU 和内存的节点\n"),t("ul",[t("li",[s._v("这类节点申请时只受单个节点最大 CPU 和内存数限制")])])]),s._v(" "),t("li",[s._v("在申请节点时可以通过加上 tag 来指定调度相应设备的节点（例如：通过 tag:A100 去专门调度 A100机器）")])]),s._v(" "),t("table",[t("thead",[t("tr",[t("th",[s._v("资源池")]),s._v(" "),t("th",[s._v("适用任务")]),s._v(" "),t("th",[s._v("受 Quota 限制")]),s._v(" "),t("th",[s._v("资源类型")]),s._v(" "),t("th",[s._v("适用场景")])])]),s._v(" "),t("tbody",[t("tr",[t("td",[t("strong",[s._v("Public 资源池")])]),s._v(" "),t("td",[s._v("共享任务、训练任务")]),s._v(" "),t("td",[s._v("✅ 受限")]),s._v(" "),t("td",[t("strong",[s._v("CPU / GPU / NPU / 端侧设备")])]),s._v(" "),t("td",[s._v("共享计算资源、实验任务")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("Project 资源池")])]),s._v(" "),t("td",[s._v("项目独占任务")]),s._v(" "),t("td",[s._v("❌ 无限制")]),s._v(" "),t("td",[t("strong",[s._v("CPU / GPU / NPU")])]),s._v(" "),t("td",[s._v("业务核心任务，避免资源抢占")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("加速硬件资源池")])]),s._v(" "),t("td",[s._v("仅加速任务")]),s._v(" "),t("td",[s._v("✅ 受限")]),s._v(" "),t("td",[t("strong",[s._v("A100 / V100 / 端侧设备")])]),s._v(" "),t("td",[s._v("机器学习、深度学习推理")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("CPU 资源池")])]),s._v(" "),t("td",[s._v("仅 CPU 任务")]),s._v(" "),t("td",[s._v("✅ 受限")]),s._v(" "),t("td",[t("strong",[s._v("CPU + 内存")])]),s._v(" "),t("td",[s._v("传统计算任务、调度系统任务")])])])]),s._v(" "),t("h4",{attrs:{id:"_2-资源申请"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2-资源申请"}},[s._v("#")]),s._v(" 2）资源申请")]),s._v(" "),t("ul",[t("li",[t("strong",[s._v("任务提交")]),s._v("：\n"),t("ul",[t("li",[s._v("检查 "),t("code",[s._v("Quota")]),s._v(" 余额，确定可用资源池")]),s._v(" "),t("li",[s._v("确认任务需求（CPU / GPU / 端侧设备）")]),s._v(" "),t("li",[s._v("若申请 "),t("code",[s._v("GPU")]),s._v(" 资源，检查 "),t("strong",[s._v("绑定的 CPU & 内存是否足够")])])])]),s._v(" "),t("li",[t("strong",[s._v("资源选择")]),s._v("：\n"),t("ul",[t("li",[t("strong",[s._v("优先调度 Project 资源")]),s._v("（不受 Quota 限制）")]),s._v(" "),t("li",[s._v("若 Project 资源不足，则尝试 "),t("strong",[s._v("Public 资源")])]),s._v(" "),t("li",[s._v("若 Public 资源也不足，则任务 "),t("strong",[s._v("进入 Pending 队列")]),s._v("，等待抢占或资源释放")])])]),s._v(" "),t("li",[t("strong",[s._v("加速硬件调度策略")]),s._v(" "),t("ul",[t("li",[s._v("若任务请求 GPU，调度系统将按照"),t("code",[s._v("GPU 绑定的 CPU & 内存分配")])]),s._v(" "),t("li",[s._v("Eg: 任务申请 "),t("strong",[s._v("A100")]),s._v(" 资源，则调度 "),t("code",[s._v("7 核 CPU + 118 GB 内存 + 1 GPU")])])])])]),s._v(" "),t("h4",{attrs:{id:"_3-任务优先级"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_3-任务优先级"}},[s._v("#")]),s._v(" 3）任务优先级")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("任务调度优先级")]),s._v(" "),t("ul",[t("li",[t("table",[t("thead",[t("tr",[t("th",[s._v("任务类型")]),s._v(" "),t("th",[s._v("资源限制")]),s._v(" "),t("th",[s._v("是否抢占 low 任务")]),s._v(" "),t("th",[s._v("适用场景")])])]),s._v(" "),t("tbody",[t("tr",[t("td",[t("strong",[s._v("High（高优先级）")])]),s._v(" "),t("td",[s._v("仅 Project 资源")]),s._v(" "),t("td",[s._v("❌ 不抢占")]),s._v(" "),t("td",[s._v("关键业务任务")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("Normal（默认）")])]),s._v(" "),t("td",[s._v("可用 Project & Public 资源")]),s._v(" "),t("td",[s._v("✅ 可抢占 low 任务")]),s._v(" "),t("td",[s._v("训练、推理任务")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("Low（低优先级）")])]),s._v(" "),t("td",[s._v("仅 Public 空闲资源")]),s._v(" "),t("td",[s._v("❌ 可能被 normal 抢占")]),s._v(" "),t("td",[s._v("可中断任务，如批量任务")])])])])])])]),s._v(" "),t("li",[t("p",[s._v("任务抢占逻辑")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("Normal 任务可抢占 Low 任务，确保资源合理利用")])]),s._v(" "),t("li",[t("p",[s._v("Low 任务仅能使用闲置资源，不会影响正常业务")])])])])]),s._v(" "),t("h4",{attrs:{id:"_4-quota-计算"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_4-quota-计算"}},[s._v("#")]),s._v(" 4）Quota 计算")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("在 K8s 资源调度中，每个用户或项目的 Quota 限制 "),t("code",[s._v("按不同资源类型")]),s._v(" 计算")]),s._v(" "),t("ul",[t("li",[t("code",[s._v("① CPU-only 任务")]),s._v(" → 仅计算 CPU & 内存")]),s._v(" "),t("li",[t("code",[s._v("② GPU 任务")]),s._v(" → 计算 GPU 数量，同时绑定 "),t("strong",[s._v("CPU & 内存")])]),s._v(" "),t("li",[t("code",[s._v("③ 端侧设备任务")]),s._v(" → 直接计算设备数量")])])]),s._v(" "),t("li",[t("p",[t("code",[s._v("Quota 计算规则")])]),s._v(" "),t("ul",[t("li",[t("table",[t("thead",[t("tr",[t("th",[s._v("资源类型")]),s._v(" "),t("th",[s._v("计算方式")]),s._v(" "),t("th",[s._v("是否受配额限制")]),s._v(" "),t("th",[s._v("示例")])])]),s._v(" "),t("tbody",[t("tr",[t("td",[t("strong",[s._v("GPU")])]),s._v(" "),t("td",[s._v("按 GPU 数量绑定 CPU & 内存")]),s._v(" "),t("td",[s._v("✅ 是")]),s._v(" "),t("td",[s._v("1 块 A100 = 7 核 CPU + 118 GB 内存")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("CPU")])]),s._v(" "),t("td",[s._v("直接计算 CPU 核数 & 内存大小")]),s._v(" "),t("td",[s._v("✅ 是")]),s._v(" "),t("td",[s._v("32 核 CPU + 64 GB 内存")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("端侧设备")])]),s._v(" "),t("td",[s._v("直接计算设备数量")]),s._v(" "),t("td",[s._v("✅ 是")]),s._v(" "),t("td",[s._v("Jetson Xavier 2 台")])])])])])])]),s._v(" "),t("li",[t("p",[t("code",[s._v("eg：GPU 任务")]),s._v(" (申请 "),t("code",[s._v("A100-80G 2 块")]),s._v(")")]),s._v(" "),t("ul",[t("li",[s._v("每块 A100 绑定 "),t("code",[s._v("7 核 CPU + 118 GB 内存")])])])])]),s._v(" "),t("h3",{attrs:{id:"_2、任务抢占"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2、任务抢占"}},[s._v("#")]),s._v(" 2、任务抢占")]),s._v(" "),t("h4",{attrs:{id:"_1-任务抢占实现"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_1-任务抢占实现"}},[s._v("#")]),s._v(" 1）任务抢占实现")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("任务抢占的核心在于 "),t("code",[s._v("K8s 自带的优先级调度机制")])])]),s._v(" "),t("li",[t("p",[s._v("当高优先级任务需要资源时，"),t("code",[s._v("K8s 会自动驱逐低优先级任务")]),s._v("，释放资源供高优先级任务运行")])]),s._v(" "),t("li",[t("p",[t("code",[s._v("① K8s 使用 PriorityClass 机制，定义不同任务的优先级")])]),s._v(" "),t("ul",[t("li",[t("code",[s._v("value")]),s._v(" 值越大，优先级越高")]),s._v(" "),t("li",[t("code",[s._v("low-priority")]),s._v(" 任务容易被 "),t("code",[s._v("normal-priority")]),s._v(" 任务抢占")])])])]),s._v(" "),t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" scheduling.k8s.io/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" PriorityClass\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" normal"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("priority\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("value")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("1000")]),s._v("  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# Normal 任务优先级较高")]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("globalDefault")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token boolean important"}},[s._v("false")]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("description")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"Normal priority jobs"')]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("---")]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" scheduling.k8s.io/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" PriorityClass\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" low"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("priority\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("value")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("500")]),s._v("  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# Low 任务优先级较低，容易被抢占")]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("globalDefault")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token boolean important"}},[s._v("false")]),s._v("\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("description")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"Low priority jobs"')]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br"),t("span",{staticClass:"line-number"},[s._v("13")]),t("br"),t("span",{staticClass:"line-number"},[s._v("14")]),t("br"),t("span",{staticClass:"line-number"},[s._v("15")]),t("br")])]),t("ul",[t("li",[t("code",[s._v("② 高优先级任务（Normal）")])])]),s._v(" "),t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" batch/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Job\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" normal"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("task\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("template")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("priorityClassName")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" normal"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("priority\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("containers")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" job\n        "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("image")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" job"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("image\n        "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("resources")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("limits")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("cpu")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"16"')]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("memory")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"128Gi"')]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br"),t("span",{staticClass:"line-number"},[s._v("13")]),t("br"),t("span",{staticClass:"line-number"},[s._v("14")]),t("br"),t("span",{staticClass:"line-number"},[s._v("15")]),t("br")])]),t("ul",[t("li",[t("code",[s._v("③ 低优先级任务（Low，可能被抢占）")])])]),s._v(" "),t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" batch/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Job\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" low"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("task\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("template")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("priorityClassName")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" low"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("priority\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("containers")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" job\n        "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("image")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" job"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("image\n        "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("resources")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n          "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("limits")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("cpu")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"16"')]),s._v("\n            "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("memory")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token string"}},[s._v('"128Gi"')]),s._v("\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br"),t("span",{staticClass:"line-number"},[s._v("13")]),t("br"),t("span",{staticClass:"line-number"},[s._v("14")]),t("br"),t("span",{staticClass:"line-number"},[s._v("15")]),t("br")])]),t("h4",{attrs:{id:"_2-被抢占任务处理"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_2-被抢占任务处理"}},[s._v("#")]),s._v(" 2）被抢占任务处理")]),s._v(" "),t("ul",[t("li",[t("code",[s._v("方案 1：Pending 超时自动失败")]),s._v(" "),t("ul",[t("li",[s._v("如果任务长时间 Pending，可以通过 "),t("code",[s._v("activeDeadlineSeconds")]),s._v(" 设置超时失败")]),s._v(" "),t("li",[s._v("任务 Pending 超过 30 分钟会自动失败，避免积压任务")])])])]),s._v(" "),t("div",{staticClass:"language-yaml line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-yaml"}},[t("code",[t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("apiVersion")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" batch/v1\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("kind")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" Job\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("metadata")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" low"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("task\n"),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("activeDeadlineSeconds")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token number"}},[s._v("1800")]),s._v("  "),t("span",{pre:!0,attrs:{class:"token comment"}},[s._v("# 30 分钟超时")]),s._v("\n  "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("template")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n    "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("spec")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("priorityClassName")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" low"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("priority\n      "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("containers")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v("\n      "),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("name")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" job\n        "),t("span",{pre:!0,attrs:{class:"token key atrule"}},[s._v("image")]),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v(":")]),s._v(" task"),t("span",{pre:!0,attrs:{class:"token punctuation"}},[s._v("-")]),s._v("image\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br"),t("span",{staticClass:"line-number"},[s._v("3")]),t("br"),t("span",{staticClass:"line-number"},[s._v("4")]),t("br"),t("span",{staticClass:"line-number"},[s._v("5")]),t("br"),t("span",{staticClass:"line-number"},[s._v("6")]),t("br"),t("span",{staticClass:"line-number"},[s._v("7")]),t("br"),t("span",{staticClass:"line-number"},[s._v("8")]),t("br"),t("span",{staticClass:"line-number"},[s._v("9")]),t("br"),t("span",{staticClass:"line-number"},[s._v("10")]),t("br"),t("span",{staticClass:"line-number"},[s._v("11")]),t("br"),t("span",{staticClass:"line-number"},[s._v("12")]),t("br")])]),t("ul",[t("li",[t("p",[t("code",[s._v("其他方案")])]),s._v(" "),t("table",[t("thead",[t("tr",[t("th",[t("strong",[s._v("问题")])]),s._v(" "),t("th",[t("strong",[s._v("解决方案")])]),s._v(" "),t("th",[t("strong",[s._v("实现方式")])])])]),s._v(" "),t("tbody",[t("tr",[t("td",[t("strong",[s._v("高优先级任务抢占低优先级任务")])]),s._v(" "),t("td",[t("strong",[s._v("使用 "),t("code",[s._v("PriorityClass")]),s._v(" 机制")])]),s._v(" "),t("td",[t("code",[s._v("priorityClassName")])])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("Pending 任务长时间无资源")])]),s._v(" "),t("td",[t("strong",[s._v("超时失败，重新提交")])]),s._v(" "),t("td",[t("code",[s._v("activeDeadlineSeconds")])])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("资源紧张导致任务一直等待")])]),s._v(" "),t("td",[t("strong",[s._v("降级 GPU/CPU 需求，重新调度")])]),s._v(" "),t("td",[s._v("调度系统修改 Job 资源")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("集群资源不足")])]),s._v(" "),t("td",[t("strong",[s._v("迁移任务到其他集群")])]),s._v(" "),t("td",[s._v("调度系统跨集群调度")])]),s._v(" "),t("tr",[t("td",[t("strong",[s._v("过期任务积压")])]),s._v(" "),t("td",[t("strong",[s._v("智能回收无用任务")])]),s._v(" "),t("td",[s._v("调度系统定期清理")])])])])])]),s._v(" "),t("h3",{attrs:{id:"_3、日志收集"}},[t("a",{staticClass:"header-anchor",attrs:{href:"#_3、日志收集"}},[s._v("#")]),s._v(" 3、日志收集")]),s._v(" "),t("ul",[t("li",[t("p",[s._v("确定日志存储路径")]),s._v(" "),t("ul",[t("li",[s._v("对于 containerd 运行时："),t("code",[s._v("/var/log/pods/{namespace}_{pod-name}_{pod-uid}/{container-name}/0.log")])]),s._v(" "),t("li",[s._v("对于 Docker 运行时："),t("code",[s._v("/var/lib/docker/containers/{container-id}/*.log")])]),s._v(" "),t("li",[s._v("可以使用 "),t("code",[s._v("kubectl get pod -o wide")]),s._v(" 获取 Pod 运行的 Node，并登录对应 Node")])])]),s._v(" "),t("li",[t("p",[t("strong",[s._v("安装 Vector")]),s._v(" 在 Kubernetes Node 上安装 Vector")]),s._v(" "),t("ul",[t("li",[t("div",{staticClass:"language-bash line-numbers-mode"},[t("pre",{pre:!0,attrs:{class:"language-bash"}},[t("code",[t("span",{pre:!0,attrs:{class:"token function"}},[s._v("curl")]),s._v(" "),t("span",{pre:!0,attrs:{class:"token parameter variable"}},[s._v("-L")]),s._v(" https://packages.timber.io/vector/latest/vector-amd64.deb "),t("span",{pre:!0,attrs:{class:"token parameter variable"}},[s._v("-o")]),s._v(" vector.deb\n"),t("span",{pre:!0,attrs:{class:"token function"}},[s._v("sudo")]),s._v(" dpkg "),t("span",{pre:!0,attrs:{class:"token parameter variable"}},[s._v("-i")]),s._v(" vector.deb\n")])]),s._v(" "),t("div",{staticClass:"line-numbers-wrapper"},[t("span",{staticClass:"line-number"},[s._v("1")]),t("br"),t("span",{staticClass:"line-number"},[s._v("2")]),t("br")])])])])]),s._v(" "),t("li",[t("p",[s._v("优缺点")])])]),s._v(" "),t("table",[t("thead",[t("tr",[t("th",[s._v("优势")]),s._v(" "),t("th",[s._v("缺点")])])]),s._v(" "),t("tbody",[t("tr",[t("td",[s._v("高效、资源占用低")]),s._v(" "),t("td",[s._v("需要手动安装 Vector")])]),s._v(" "),t("tr",[t("td",[s._v("可本地处理日志，减少 ClickHouse 压力")]),s._v(" "),t("td",[s._v("配置较复杂")])]),s._v(" "),t("tr",[t("td",[s._v("支持 ClickHouse、Kafka、Loki 等多个存储")]),s._v(" "),t("td",[s._v("需要调优以避免数据丢失")])])])])])}),[],!1,null,null,null);t.default=e.exports}}]);