Try to further speed up inserting missing derivation source files

Switch from using a recursive query to doing a breath first search through the
graph of derivations, as I think PostgreSQL wasn't doing a great job of
planning the recursive queries (it would overestimate the rows involved, and
prefer sequential scans for the derivation_outputs table).
This commit is contained in:
Christopher Baines 2022-03-02 18:00:36 +00:00
parent c5b504e94a
commit f86657915e

View file

@ -1738,37 +1738,38 @@ WHERE " criteria ";"))
(chunk! missing-file-names 2000)))))
(define (derivation-file-names->derivation-ids conn derivation-file-names)
(define (select-source-files-missing-nars! derivation-ids)
(define (derivation-ids->all-related-derivation-ids ids)
(define query
(string-append
"
WITH RECURSIVE all_derivations AS (
SELECT column1 AS derivation_id
FROM (VALUES "
(string-join (map
(lambda (id)
(string-append "(" id ")"))
(map number->string ids))
", ")
") AS data
UNION
SELECT derivation_outputs.derivation_id
FROM all_derivations
INNER JOIN derivation_inputs
ON derivation_inputs.derivation_id = all_derivations.derivation_id
INNER JOIN derivation_outputs
ON derivation_outputs.id = derivation_inputs.derivation_output_id
)
SELECT all_derivations.derivation_id
FROM all_derivations"))
(define (insert-source-files-missing-nars derivation-ids)
(define (derivation-ids->next-related-derivation-ids! ids seen-ids)
(delete-duplicates/sort!
(append-map!
(lambda (ids-chunk)
(let ((query
(string-append
"
SELECT derivation_outputs.derivation_id
FROM derivation_inputs
INNER JOIN derivation_outputs
ON derivation_outputs.id = derivation_inputs.derivation_output_id
WHERE derivation_inputs.derivation_id IN ("
(string-join (map number->string ids) ",")
")")))
(map (lambda (row)
(string->number
(car row)))
(with-time-logging
"querying for batch of all related derivation ids"
(exec-query conn query))))
(filter-map
(lambda (row)
(let ((number
(string->number
(car row))))
(if (hash-ref seen-ids number)
#f
(begin
(hash-set! seen-ids number #t)
number))))
(with-time-logging
"querying for batch of all related derivation ids"
(exec-query conn query)))))
(chunk! ids 2000))
<))
(define (derivation-ids->missing-sources ids)
(define query
@ -1787,21 +1788,42 @@ INNER JOIN derivation_source_files
")
AND derivation_source_file_nars.derivation_source_file_id IS NULL"))
(with-time-logging "finding batch of missing sources"
(exec-query conn query)))
(map (lambda (row)
(list (string->number (first row))
(second row)))
(with-time-logging "finding batch of missing sources"
(exec-query conn query))))
(let ((all-derivation-ids
(with-time-logging "querying for all related dervation ids"
(delete-duplicates/sort!
(append-map!
derivation-ids->all-related-derivation-ids
(chunk! derivation-ids 5000))
<))))
(let ((seen-ids (make-hash-table)))
(let loop ((next-related-derivation-ids
(with-time-logging "querying for next related dervation ids"
(derivation-ids->next-related-derivation-ids!
(list-copy derivation-ids)
seen-ids))))
(unless (null? next-related-derivation-ids)
(let ((missing-sources
(with-time-logging "querying for missing sources"
(append-map! derivation-ids->missing-sources
(chunk next-related-derivation-ids
10000)))))
(with-time-logging "querying for missing sources"
(append-map! derivation-ids->missing-sources
(chunk! all-derivation-ids
10000)))))
(unless (null? missing-sources)
(with-time-logging
(simple-format #f "inserting ~A missing source files"
(length missing-sources))
(for-each (match-lambda
((derivation-source-file-id store-path)
(insert-derivation-source-file-nar
conn
derivation-source-file-id
store-path)))
missing-sources))))
(loop
(with-time-logging "querying for next related dervation ids"
(derivation-ids->next-related-derivation-ids!
next-related-derivation-ids
seen-ids)))))))
(if (null? derivation-file-names)
'()
@ -1854,13 +1876,7 @@ INNER JOIN derivation_source_files
(error "missing derivation id")))
derivation-file-names)))
(with-time-logging "inserting missing source files"
(for-each (match-lambda
((derivation-source-file-id store-path)
(insert-derivation-source-file-nar
conn
(string->number derivation-source-file-id)
store-path)))
(select-source-files-missing-nars! all-ids)))
(with-time-logging "insert-source-files-missing-nars"
(insert-source-files-missing-nars all-ids))
all-ids)))))