Rework loading revision data

These changes were motivated by switching to a mechanism of loading data that
isn't dependent on the big advisory lock that prevents more than one revision
from being processed at a time.

Since INSERT ... RETURNING id; is used, this can block if another transaction
inserts the same data, and then cause an error when that transaction
commits. The solution is to use ON CONFLICT DO NOTHING, but you have to handle
the case when the INSERT doesn't return an id since the other transaction has
inserted it.

This commit rewrites insert-missing-data-and-return-all-ids to do as described
above, as well as being more efficient in how existing data is detected and to
use more vectors. Other utilities for inserting data are added as well.
This commit is contained in:
Christopher Baines 2024-12-05 20:56:23 +00:00
parent b128e9bd7a
commit 5ed98343d7
25 changed files with 874 additions and 1149 deletions

View file

@ -846,95 +846,84 @@
conn
(inferior-packages->license-id-lists
conn
;; TODO Don't needlessly convert
(vector->list
(assq-ref inferior-packages-data 'license-data))))))
(assq-ref inferior-packages-data 'license-data)))))
(all-package-metadata-ids
new-package-metadata-ids
(with-time-logging "inserting package metadata entries"
(inferior-packages->package-metadata-ids
conn
;; TODO Don't needlessly convert
(vector->list
(assq-ref inferior-packages-data 'metadata))
(assq-ref inferior-packages-data 'metadata)
package-license-set-ids)))
(replacement-package-ids
(vector-map
(lambda (_ package-index-or-false)
(if package-index-or-false
(first
(vector-ref
(inferior-packages->package-ids
conn
(list (list (vector-ref names package-index-or-false)
(vector-ref versions package-index-or-false)
(list-ref all-package-metadata-ids
package-index-or-false)
(cons "integer" NULL)))))
(vector
(list (vector-ref names package-index-or-false)
(vector-ref versions package-index-or-false)
(vector-ref all-package-metadata-ids
package-index-or-false)
(cons "integer" NULL))))
0)
(cons "integer" NULL)))
(assq-ref inferior-packages-data 'replacements))))
(unless (null? new-package-metadata-ids)
(unless (= 0 (vector-length new-package-metadata-ids))
(with-time-logging "inserting package metadata tsvector entries"
(insert-package-metadata-tsvector-entries
conn new-package-metadata-ids)))
(with-time-logging "getting package-ids (without replacements)"
(list->vector
(inferior-packages->package-ids
conn
;; TODO Do this more efficiently
(zip (vector->list names)
(vector->list versions)
all-package-metadata-ids
(vector->list replacement-package-ids)))))))
(inferior-packages->package-ids
conn
;; Similar to zip, but generating a vector of lists
(vector-map (lambda (index . vals) vals)
names
versions
all-package-metadata-ids
replacement-package-ids)))))
(define (insert-lint-warnings conn
package-ids
lint-checker-ids
lint-warnings-data)
(lint-warnings-data->lint-warning-ids
conn
(append-map!
(concatenate!
(filter-map
(lambda (lint-checker-id warnings-per-package)
(if warnings-per-package
(vector-fold
(lambda (_ result package-id warnings)
(append!
result
(map
(match-lambda
((location-data messages-by-locale)
(let ((location-id
(location->location-id
conn
(apply location location-data)))
(lint-warning-message-set-id
(lint-warning-message-data->lint-warning-message-set-id
conn
messages-by-locale)))
(list lint-checker-id
package-id
location-id
lint-warning-message-set-id))))
(fold (lambda (location-and-messages result)
;; TODO Sort to delete duplicates, rather than use member
(if (member location-and-messages result)
(begin
(apply
simple-format
(current-error-port)
"warning: skipping duplicate lint warning ~A ~A\n"
location-and-messages)
result)
(append! result
(list location-and-messages))))
'()
warnings))))
(if (null? warnings)
result
(cons
(lint-warnings-data->lint-warning-ids
conn
(list->vector
(map
(match-lambda
((location-data messages-by-locale)
(let ((location-id
(location->location-id
conn
(apply location location-data)))
(lint-warning-message-set-id
(lint-warning-message-data->lint-warning-message-set-id
conn
messages-by-locale)))
(list lint-checker-id
package-id
location-id
lint-warning-message-set-id))))
warnings)))
result)))
'()
package-ids
warnings-per-package)
'()))
lint-checker-ids
#f))
(vector->list lint-checker-ids)
lint-warnings-data)))
(define (update-derivation-ids-hash-table! conn
@ -977,30 +966,25 @@
#:key (log-tag "unspecified"))
(define (insert-into-derivations conn drvs)
(string-append
"INSERT INTO derivations "
"(file_name, builder, args, env_vars, system_id) VALUES "
(string-join
(map (match-lambda
(($ <derivation> outputs inputs sources
system builder args env-vars file-name)
(simple-format
#f "('~A', '~A', ARRAY[~A]::varchar[], ARRAY[~A], '~A')"
file-name
builder
(string-join (map quote-string args) ",")
(string-join (map (match-lambda
((key . value)
(string-append
"['" key '"', $$"
value "$$ ]")))
env-vars)
",")
(system->system-id conn system))))
drvs)
",")
" RETURNING id"
";"))
(insert-missing-data-and-return-all-ids
conn
"derivations"
'(file_name builder args env_vars system_id)
(vector-map (match-lambda*
((_ ($ <derivation> outputs inputs sources
system builder args env-vars file-name))
(list file-name
builder
(cons "varchar[]"
(list->vector args))
(cons "varchar[][]"
(list->vector
(map (match-lambda
((key . value)
(vector key value)))
env-vars)))
(system->system-id conn system))))
drvs)))
(define (insert-derivations)
(with-resource-from-pool postgresql-connection-pool conn
@ -1021,45 +1005,41 @@
(let ((derivations
;; Do this while holding the PostgreSQL connection to
;; avoid conflicts with other fibers
(delete-duplicates
(filter-map (lambda (derivation)
(if (hash-ref derivation-ids-hash-table
(derivation-file-name
derivation))
#f
derivation))
unfiltered-derivations))))
(if (null? derivations)
(values '() '())
(list->vector
(delete-duplicates
(filter-map (lambda (derivation)
(if (hash-ref derivation-ids-hash-table
(derivation-file-name
derivation))
#f
derivation))
unfiltered-derivations)))))
(if (= 0 (vector-length derivations))
(values #() #())
(begin
(simple-format
(current-error-port)
"insert-missing-derivations: inserting ~A derivations (~A)\n"
(length derivations)
(vector-length derivations)
log-tag)
(let ((derivation-ids
(append-map!
(lambda (chunk)
(map (lambda (result)
(string->number (car result)))
(exec-query conn (insert-into-derivations conn chunk))))
(chunk derivations 500))))
(insert-into-derivations conn derivations)))
;; Do this while holding the connection so that other
;; fibers don't also try inserting the same derivations
(with-time-logging
(string-append "insert-missing-derivations: updating hash table (" log-tag ")")
(for-each (lambda (derivation derivation-id)
(hash-set! derivation-ids-hash-table
(derivation-file-name derivation)
derivation-id))
(vector-for-each (lambda (_ derivation derivation-id)
(hash-set! derivation-ids-hash-table
(derivation-file-name derivation)
derivation-id))
derivations
derivation-ids))
(simple-format
(current-error-port)
"insert-missing-derivations: finished inserting ~A derivations (~A)\n"
(length derivations)
(vector-length derivations)
log-tag)
(values derivations
@ -1137,10 +1117,10 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
hash
compressed-nar-bytevector
uncompressed-size))))))))
sources-ids
(vector->list sources-ids)
sources)))))
derivation-ids
derivations)))
(vector->list derivation-ids)
(vector->list derivations))))
(let ((derivations
derivation-ids
@ -1154,12 +1134,13 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
(string-append "insert-missing-derivations: inserting outputs ("
log-tag ")")
(with-resource-from-pool postgresql-connection-pool conn
(for-each (lambda (derivation-id derivation)
(insert-derivation-outputs conn
derivation-id
(derivation-outputs derivation)))
derivation-ids
derivations)))
(vector-for-each
(lambda (_ derivation-id derivation)
(insert-derivation-outputs conn
derivation-id
(derivation-outputs derivation)))
derivation-ids
derivations)))
(with-time-logging
(string-append
@ -1169,7 +1150,7 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
(map
derivation-input-derivation
(append-map derivation-inputs
derivations))))
(vector->list derivations)))))
(unless (null? input-derivations)
;; Ensure all the input derivations exist
(for-each
@ -1182,12 +1163,14 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
#:log-tag log-tag))
(chunk! input-derivations 1000))))))
(string-append "insert-missing-derivations: done parallel (" log-tag ")")
(simple-format
(current-error-port)
"debug: insert-missing-derivations: done parallel (~A)\n" log-tag)
(with-resource-from-pool postgresql-connection-pool conn
(with-time-logging
(simple-format
#f "insert-missing-derivations: inserting inputs for ~A derivations (~A)"
(length derivations)
(vector-length derivations)
log-tag)
(insert-derivation-inputs conn
derivation-ids
@ -1913,25 +1896,28 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
(define inferior-lint-checkers-data
(call-with-inferior
(lambda (inferior inferior-store)
(inferior-lint-checkers inferior))))
(list->vector
(inferior-lint-checkers inferior)))))
(when inferior-lint-checkers-data
(letpar& ((lint-checker-ids
(with-resource-from-pool postgresql-connection-pool conn
(lint-checkers->lint-checker-ids
conn
(map (match-lambda
((name descriptions-by-locale network-dependent)
(list
name
network-dependent
;; Uses of sort may cause problems
(call-with-worker-thread
utility-thread-channel
(lambda ()
(lint-checker-description-data->lint-checker-description-set-id
conn descriptions-by-locale))))))
inferior-lint-checkers-data))))
(vector-map
(match-lambda*
((_ (name descriptions-by-locale network-dependent))
(list
name
network-dependent
;; Uses of sort may cause problems
(call-with-worker-thread
utility-thread-channel
(lambda ()
(lint-checker-description-data->lint-checker-description-set-id
conn
descriptions-by-locale))))))
inferior-lint-checkers-data))))
(lint-warnings-data
(fibers-map
(match-lambda
@ -1946,7 +1932,8 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
(inferior-lint-warnings inferior
inferior-store
checker-name)))))))
inferior-lint-checkers-data)))
(vector->list
inferior-lint-checkers-data))))
(let ((package-ids (fibers-force package-ids-promise)))
(with-resource-from-pool postgresql-connection-pool conn
@ -1955,20 +1942,21 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
(fibers-force guix-revision-id-promise)
lint-checker-ids)
(let ((lint-warning-ids
(insert-lint-warnings
(let ((lint-warning-id-vectors
(with-time-logging "inserting lint warnings"
(insert-lint-warnings
conn
package-ids
lint-checker-ids
lint-warnings-data))))
(with-time-logging "inserting guix revision lint warnings"
(for-each
(lambda (lint-warning-ids)
(insert-guix-revision-lint-warnings
conn
package-ids
lint-checker-ids
lint-warnings-data)))
(chunk-for-each!
(lambda (lint-warning-ids-chunk)
(insert-guix-revision-lint-warnings
conn
(fibers-force guix-revision-id-promise)
lint-warning-ids-chunk))
5000
lint-warning-ids)))))))
(fibers-force guix-revision-id-promise)
lint-warning-ids))
lint-warning-id-vectors))))))))
(define (extract-and-store-package-derivations)
(define packages-count
@ -2113,7 +2101,8 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
guix-revision-id
package-derivation-ids-chunk)))
2000
package-derivation-ids)))
;; TODO Chunk more efficiently
(vector->list package-derivation-ids))))
'finished)
@ -2386,7 +2375,7 @@ SELECT 1 FROM derivation_source_file_nars WHERE derivation_source_file_id = $1"
(with-time-logging "updating builds.derivation_output_details_set_id"
(update-builds-derivation-output-details-set-id
conn
(string->number (fibers-force guix-revision-id-promise)))))
(fibers-force guix-revision-id-promise))))
(begin
(simple-format #t "Failed to generate store item for ~A\n"
commit)