diff options
author | merakor <cem@ckyln.com> | 2021-02-04 09:42:49 +0000 |
---|---|---|
committer | merakor <cem@ckyln.com> | 2021-02-04 09:42:49 +0000 |
commit | 37d93c5413b1025c8490b57edf2dea821320d2ca (patch) | |
tree | e48f5214c88cb2c032b99cbbe0fa492b6d7e5a56 /src | |
parent | 492d8cca79b1cb392aa585bce2e269f40dba070a (diff) | |
download | cpt-37d93c5413b1025c8490b57edf2dea821320d2ca.tar.gz |
pkg_extract(): Add support for pax and add new methods
Even though we had a portable extraction method for tarballs, it slowed
down extraction immensely. I have added two new methods for tarball
extraction so that we get faster extraction speeds depending on the system's
tar implementation. Here are the options, in order of preference:
- pax: Although an uncommon program, pax is defined by POSIX, and
supports regular expressions, meaning that we can 'strip' components.
This is slightly faster than bsdtar and GNU tar.
- bsdtar/gnutar: We simply use '--strip-components 1' for these tar
implementations.
- tar: This is the portable method we had been using for a while. It's
good and gets the job done, but it is slower.
Here is also a small benchmark I ran, extracting the Linux tarball
10 times per method.
tarball=$CPT_CACHE/sources/linux-headers/linux-5.10.12.tar.xz
for extract in pax bsdtar tar; do
time sh -c ". cpt-lib
extract=$extract
for i in \$(seq 10); do
mkdir -p hdr; cd hdr;
tar_extract \"$tarball\"; cd ..;
rm -rf hdr
done"
done
This yields the following output; the methods are in the same order as above:
sh -c 75.21s user 11.11s system 115% cpu 1:14.90 total
sh -c 78.68s user 14.64s system 124% cpu 1:15.15 total
sh -c 99.88s user 69.99s system 119% cpu 2:21.70 total
FossilOrigin-Name: 45e3a9a03e7eff9b30b0e911d68d44068e8681ebf4501f271c8a686118f29243
Diffstat (limited to 'src')
-rw-r--r-- | src/cpt-lib.in | 126 |
1 files changed, 71 insertions, 55 deletions
diff --git a/src/cpt-lib.in b/src/cpt-lib.in index 5690ebe..8c7989e 100644 --- a/src/cpt-lib.in +++ b/src/cpt-lib.in @@ -479,6 +479,69 @@ sh256() { while read -r hash _; do printf '%s %s\n' "$hash" "$1"; done } +tar_extract() { + # Tarball extraction function that prefers pax(1) over tar(1). The reason we + # are preferring pax is that we can strip components without relying on + # ugly hacks such as the ones we are doing for 'tar'. Using 'tar' means that + # we either have to sacrifice speed or portability, and we are choosing to + # sacrifice speed. Fortunately, we don't have to make such a choice when + # using pax. + case "${extract##*/}" in + pax) decompress "$1" | pax -r -s '/[^\/]*\///' ;; + gtar|bsdtar) decompress "$1" | "$tar" xf - --strip-components 1 ;; + tar) decompress "$1" > .ktar + + "$tar" xf .ktar || return + + # We now list the contents of the tarball so we can do our + # version of 'strip-components'. + "$tar" tf .ktar | + while read -r file; do printf '%s\n' "${file%%/*}"; done | + + # Do not repeat files. + uniq | + + # For every directory in the base we move each file + # inside it to the upper directory. + while read -r dir ; do + + # Skip if we are not dealing with a directory here. + # This way we don't remove files on the upper directory + # if a tar archive doesn't need directory stripping. + [ -d "${dir#.}" ] || continue + + # Change into the directory in a subshell so we don't + # need to cd back to the upper directory. + ( + cd "$dir" + + # We use find because we want to move hidden files + # as well. + # + # Skip the file if it has the same name as the directory. + # We will deal with it later. + # + # Word splitting is intentional here. + # shellcheck disable=2046 + find . \( ! -name . -prune \) ! -name "$dir" \ + -exec mv -f {} .. \; + + # If a file/directory with the same name as the directory + # exists, append a '.cptbak' to it and move it to the + # upper directory. + ! 
[ -e "$dir" ] || mv "$dir" "../${dir}.cptbak" + ) + rmdir "$dir" + + # If a backup file exists, move it into the original location. + ! [ -e "${dir}.cptbak" ] || mv "${dir}.cptbak" "$dir" + done + + # Clean up the temporary tarball. + rm -f .ktar + esac +} + pkg_owner() { set +f @@ -697,63 +760,10 @@ pkg_extract() { # extraction. Other filetypes are simply copied to '$mak_dir' # which allows for manual extraction. *://*.tar|*://*.tar.??|*://*.tar.???|*://*.tar.????|*://*.tgz|*://*.txz) - - decompress "$src_dir/$1/${src##*/}" > .ktar - - "$tar" xf .ktar || die "$1" "Couldn't extract ${src##*/}" - - # We now list the contents of the tarball so we can do our - # version of 'strip-components'. - "$tar" tf .ktar | - while read -r file; do printf '%s\n' "${file%%/*}"; done | - - # Do not repeat files. - uniq | - - # For every directory in the base we move each file - # inside it to the upper directory. - while read -r dir ; do - - # Skip if we are not dealing with a directory here. - # This way we don't remove files on the upper directory - # if a tar archive doesn't need directory stripping. - [ -d "${dir#.}" ] || continue - - # Change into the directory in a subshell so we don't - # need to cd back to the upper directory. - ( - cd "$dir" - - # We use find because we want to move hidden files - # as well. - # - # Skip the file if it has the same name as the directory. - # We will deal with it later. - # - # Word splitting is intentional here. - # shellcheck disable=2046 - find . \( ! -name . -prune \) ! -name "$dir" \ - -exec mv -f {} .. \; - - # If a file/directory with the same name as the directory - # exists, append a '.cptbak' to it and move it to the - # upper directory. - ! [ -e "$dir" ] || mv "$dir" "../${dir}.cptbak" - ) - rmdir "$dir" - - # If a backup file exists, move it into the original location. - ! [ -e "${dir}.cptbak" ] || mv "${dir}.cptbak" "$dir" - done - - # Clean up the temporary tarball. 
- rm -f .ktar - ;; + tar_extract "$src_dir/$1/${src##*/}" ;; *://*.cpio|*://*.cpio.??|*://*.cpio.???|*://*.cpio.????) - decompress "$src_dir/$1/${src##*/}" | cpio -i - - ;; + decompress "$src_dir/$1/${src##*/}" | pax -r ;; *://*.zip) unzip "$src_dir/$1/${src##*/}" || @@ -1961,6 +1971,12 @@ create_cache() { # you value performance. tar=$(command -v bsdtar || command -v gtar) || tar=tar + # Prefer libarchive tar, GNU tar, or the POSIX defined pax for tarball + # extraction, as they can strip components, which is much much faster than + # our portability function. Our first preference is pax, because it is + # actually slightly faster than bsdtar and GNU tar. + extract=$(command -v pax || command -v "$tar") + # Figure out which 'sudo' command to use based on the user's choice or # what is available on the system. su=${CPT_SU:-$(command -v sls || |