% Conference paper: Compute-in-Memory (CIM) dataflow for LLM inference, ISVLSI 2025.
% The citation key is kept as-is so existing citations keep resolving.
% Cleanup applied to this record:
%   - author given names no longer carry escaped braces, which typeset as literal braces;
%   - the language field held an unresolved placeholder (???core.languages.en_GB???) and is now a plain language name;
%   - the abstract's multiplication signs and the flattened exponent (10-3) are restored as LaTeX math;
%   - acronyms are brace-protected so sentence-casing styles keep their capitals.
% NOTE(review): the original note gave the dates as 06-07-2025 through 09-07-2025; they are read
%   here as day-month-year (6 to 9 July), hence month = jul. Confirm against the proceedings.
% NOTE(review): no pages field was present; add it once verified via the DOI.
@inproceedings{8807d5e1933b484fbed0ad9cf8176a31,
  author    = {Li, Meng Syuan and Ke, Jung Fang and Huang, En Ming and Liu, Zhi Wei and Chen, Yu Guang and Lee, Chun Yi},
  title     = {{CIM} for Transformer Models: Enhancing Large Language Model Inference Efficiency},
  booktitle = {{IEEE} Computer Society Annual Symposium on {VLSI}, {ISVLSI} 2025 -- Conference Proceedings},
  series    = {Proceedings of {IEEE} Computer Society Annual Symposium on {VLSI}, {ISVLSI}},
  publisher = {IEEE Computer Society},
  year      = {2025},
  month     = jul,
  doi       = {10.1109/ISVLSI65124.2025.11130222},
  language  = {english},
  note      = {28th IEEE Computer Society Annual Symposium on VLSI, ISVLSI 2025; conference dates: 6--9 July 2025. Publisher copyright: {\textcopyright} 2025 IEEE.},
  abstract  = {In the field of large language model (LLM) inference, the high computational demand and extensive memory requirements for weights and key-value (KV) cache storage present significant challenges. This issue becomes especially problematic when relying exclusively on GPUs, as they often lack the capacity to accommodate the entire KV cache, particularly in larger LLMs. In the absence of direct communications like NVlink among multiple GPUs, LLMs typically require offloading the KV cache to the CPU for storage and computation, followed by transferring the multi-head attention results back to the GPU for subsequent transformer computations. Given that attention score computation is computationally demanding on the CPU and requires substantial data movement between KV caches and memory, the direct computation of attention scores and even the feedforward layers on Compute-in-Memory (CIM) systems emerges as a viable alternative. This paper is at the forefront of integrating CIM technology in LLM inference, and proposes an innovative architecture that leverages this emerging technology to enhance inference efficiency. Specifically, we present a tailored CIM-based dataflow and hierarchy design to optimize the computation of attention scores and feed-forward layers using CIMs. The results show improvements in performance, with $0.026\times$ inference latency and $1.199 \times 10^{-3}\times$ energy as compared to a CPU-based implementation.},
}