Try to further speed up inserting missing derivation source files

Switch from using a recursive query to doing a breadth-first search through the
graph of derivations, as I think PostgreSQL wasn't doing a great job of
planning the recursive queries (it would overestimate the rows involved, and
would prefer sequential scans for the derivation_outputs table).
This commit is contained in:
Christopher Baines 2022-03-02 18:00:36 +00:00
parent c5b504e94a
commit f86657915e

View file

@ -1738,37 +1738,38 @@ WHERE " criteria ";"))
(chunk! missing-file-names 2000))))) (chunk! missing-file-names 2000)))))
(define (derivation-file-names->derivation-ids conn derivation-file-names) (define (derivation-file-names->derivation-ids conn derivation-file-names)
(define (select-source-files-missing-nars! derivation-ids) (define (insert-source-files-missing-nars derivation-ids)
(define (derivation-ids->all-related-derivation-ids ids) (define (derivation-ids->next-related-derivation-ids! ids seen-ids)
(define query (delete-duplicates/sort!
(string-append (append-map!
" (lambda (ids-chunk)
WITH RECURSIVE all_derivations AS ( (let ((query
SELECT column1 AS derivation_id (string-append
FROM (VALUES " "
(string-join (map SELECT derivation_outputs.derivation_id
(lambda (id) FROM derivation_inputs
(string-append "(" id ")")) INNER JOIN derivation_outputs
(map number->string ids)) ON derivation_outputs.id = derivation_inputs.derivation_output_id
", ") WHERE derivation_inputs.derivation_id IN ("
") AS data (string-join (map number->string ids) ",")
UNION ")")))
SELECT derivation_outputs.derivation_id
FROM all_derivations
INNER JOIN derivation_inputs
ON derivation_inputs.derivation_id = all_derivations.derivation_id
INNER JOIN derivation_outputs
ON derivation_outputs.id = derivation_inputs.derivation_output_id
)
SELECT all_derivations.derivation_id
FROM all_derivations"))
(map (lambda (row) (filter-map
(string->number (lambda (row)
(car row))) (let ((number
(with-time-logging (string->number
"querying for batch of all related derivation ids" (car row))))
(exec-query conn query)))) (if (hash-ref seen-ids number)
#f
(begin
(hash-set! seen-ids number #t)
number))))
(with-time-logging
"querying for batch of all related derivation ids"
(exec-query conn query)))))
(chunk! ids 2000))
<))
(define (derivation-ids->missing-sources ids) (define (derivation-ids->missing-sources ids)
(define query (define query
@ -1787,21 +1788,42 @@ INNER JOIN derivation_source_files
") ")
AND derivation_source_file_nars.derivation_source_file_id IS NULL")) AND derivation_source_file_nars.derivation_source_file_id IS NULL"))
(with-time-logging "finding batch of missing sources" (map (lambda (row)
(exec-query conn query))) (list (string->number (first row))
(second row)))
(with-time-logging "finding batch of missing sources"
(exec-query conn query))))
(let ((all-derivation-ids (let ((seen-ids (make-hash-table)))
(with-time-logging "querying for all related dervation ids" (let loop ((next-related-derivation-ids
(delete-duplicates/sort! (with-time-logging "querying for next related dervation ids"
(append-map! (derivation-ids->next-related-derivation-ids!
derivation-ids->all-related-derivation-ids (list-copy derivation-ids)
(chunk! derivation-ids 5000)) seen-ids))))
<)))) (unless (null? next-related-derivation-ids)
(let ((missing-sources
(with-time-logging "querying for missing sources"
(append-map! derivation-ids->missing-sources
(chunk next-related-derivation-ids
10000)))))
(with-time-logging "querying for missing sources" (unless (null? missing-sources)
(append-map! derivation-ids->missing-sources (with-time-logging
(chunk! all-derivation-ids (simple-format #f "inserting ~A missing source files"
10000))))) (length missing-sources))
(for-each (match-lambda
((derivation-source-file-id store-path)
(insert-derivation-source-file-nar
conn
derivation-source-file-id
store-path)))
missing-sources))))
(loop
(with-time-logging "querying for next related dervation ids"
(derivation-ids->next-related-derivation-ids!
next-related-derivation-ids
seen-ids)))))))
(if (null? derivation-file-names) (if (null? derivation-file-names)
'() '()
@ -1854,13 +1876,7 @@ INNER JOIN derivation_source_files
(error "missing derivation id"))) (error "missing derivation id")))
derivation-file-names))) derivation-file-names)))
(with-time-logging "inserting missing source files" (with-time-logging "insert-source-files-missing-nars"
(for-each (match-lambda (insert-source-files-missing-nars all-ids))
((derivation-source-file-id store-path)
(insert-derivation-source-file-nar
conn
(string->number derivation-source-file-id)
store-path)))
(select-source-files-missing-nars! all-ids)))
all-ids))))) all-ids)))))