We all know that LoRA is a low-rank adaptation method. Using the row-vector convention of the code below, it can be formulated as h = x W_0 + s · (x A B), where A is the low-rank down-projection, B is the up-projection, and s is a scaling factor (α/r in the original paper). I have two different implementations of the low-rank branch. Are there any differences between them?
Code 1:
def forward(self, x):
    # Two sequential matmuls: (x @ A) @ B, never materializing A @ B
    x = x @ self.lora_A
    x = x @ self.lora_B
    x = self.scaling * x
    return x
Code 2:
def forward(self, x):
    # Materialize the rank-r product A @ B first, then apply it: x @ (A @ B)
    x = x @ (self.lora_A @ self.lora_B)
    x = self.scaling * x
    return x
From a mathematical perspective, both seem equivalent: (x A) B = x (A B) by associativity of matrix multiplication. However, when I run both implementations on a toy dataset, I observe a very slight difference in their performance: Code 2 performs slightly better. I'm not completely sure whether both implementations are correct; I often see Code 1 in GitHub repositories, yet in my runs Code 2 is the one that does slightly better. Why might this slight difference occur? Is there an underlying computational or optimization nuance that could explain it?
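For reference, here is a minimal check of whether the two orderings even produce bitwise-identical outputs in floating point; the shapes, rank, and seed are arbitrary choices for illustration:

import torch

torch.manual_seed(0)
x = torch.randn(32, 512)           # a batch of inputs
A = torch.randn(512, 8) * 0.01     # down-projection, rank 8
B = torch.randn(8, 512)            # up-projection

out1 = (x @ A) @ B     # Code 1: two sequential matmuls
out2 = x @ (A @ B)     # Code 2: materialize A @ B first

# Matrix multiplication is associative in exact arithmetic but not in
# floating point, so the two results can differ by a tiny amount.
print(torch.allclose(out1, out2))
print((out1 - out2).abs().max())

If the max difference comes out nonzero, that would point to floating-point non-associativity as one source of divergence, though I don't see why that alone would systematically favor one ordering over the other.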