Linux file statistics
The snippet can be accessed without any authentication.
Authored by
remy
A small script that extracts file statistics, using Miller (mlr) when it is installed, falling back to datamash, and otherwise to plain awk.
.gitignore 55 B
awk_stats_on_1st_field.awk 519 B
files_statistics.sh 19.28 KiB
#!/bin/bash
#
# files_statistics.sh - list the regular files under a directory with find(1)
# and report size / access-time / modification-time statistics using mlr,
# datamash or awk, depending on which of them is found on PATH.
#
# Uncomment the next line to trace every command while debugging.
#set -x
# Version string printed by the -v option.
VERSION="0.1.1"
# Print the command synopsis and the description of every option on stdout.
# The first line starts with the name the script was invoked as ($0).
usage() {
  cat <<EOF
$0 [-d directory][-a [ALL|AGE|ACCESS_TIME|MODIF_TIME|SIZE]][-K][-F][-u][-g][-c|-b][-q][-Q][-G <user/group/both/filetype/all>][-v][-h]
[-d <dir>]: the directory to analyze (default is '.'),
[-a <COMPUTE>]: the type of analyze to perform (default is 'ALL' [*]),
[-K]: keep previous results (default is to clean previous results),
[-F]: use the file command for each find results (time consumming++; default is to not do it),
[-u]: add the user to every find results (default is to not do it),
[-g]: add the group to every find results (default is to not do it),
[-c]: add the creation time to every find results (default is to not do it).
[-b]: same as -c (birth time)
[-q]: do not display any notice/warning informations,
[-Q]: just performs operations. Do not display anything.
[-G <user|group|both|filetype/all>]: display statistics grouped by user (implies -u), group(implies -g), both (implies -ug),
filetype (implies -F), all (implies -Fug), or none.
Default is to not group the results (none).
[-v]: print version and exit
[-h]: print this help and exit
[*]: ALL is equivalent to AGE[**] + SIZE statistics
[**]: AGE = ACCESS_TIME + MODIF_TIME statistics
EOF
}
# Print the script version (global VERSION) followed by a newline on stdout.
version() {
  # Quoted and printed through printf so the value is emitted verbatim:
  # no word splitting, no globbing, no echo option parsing.
  printf '%s\n' "${VERSION}"
}
# Defaults for every command-line switch; the getopts loop below overrides them.
KEEP=false
QUIET=false
HIDE=false
WITH_FILE_COMMAND=false
WITH_USER=false
WITH_GROUP=false
WITH_CREATION_TIME=false
GROUPED_BY="none"

# The leading ':' selects getopts' silent mode: a missing argument is reported
# as opt=':' and an unknown option as opt='?', with the offending letter in
# OPTARG. Only -d, -a and -G take an argument; -h does not.
while getopts ":hd:a:KFugcbqQG:v" opt; do
  case "${opt}" in
    a)
      _COMPUTE=${OPTARG}
      ;;
    d)
      _DIR=${OPTARG}
      ;;
    K)
      KEEP=true
      ;;
    u)
      WITH_USER=true
      ;;
    g)
      WITH_GROUP=true
      ;;
    c|b)
      WITH_CREATION_TIME=true
      ;;
    F)
      WITH_FILE_COMMAND=true
      ;;
    Q)
      HIDE=true
      ;;
    q)
      QUIET=true
      ;;
    G)
      GROUPED_BY=${OPTARG}
      ;;
    v)
      version
      exit 0
      ;;
    h)
      usage
      exit 0
      ;;
    :)
      # Usage errors go to stderr and end with a non-zero status.
      echo "Option -${OPTARG} requires an argument." >&2
      usage >&2
      exit 1
      ;;
    *)
      echo "Unknown option: -${OPTARG}" >&2
      usage >&2
      exit 1
      ;;
  esac
done
shift "$((OPTIND-1))"

# Fall back to the documented defaults when -a / -d were not given.
_COMPUTE="${_COMPUTE:-ALL}"
_DIR="${_DIR:-.}"
# Locate the optional and required external tools.
# Globals written:
#   _MLR      - path of Miller (mlr), empty when not installed
#   _DATAMASH - path of datamash, empty when not installed
#   _GAWK     - path of the awk interpreter to use, empty when none is found
check_commands() {
  # checking if miller is installed
  _MLR=$(command -v mlr)
  # checking if datamash is installed
  _DATAMASH=$(command -v datamash)
  # The script asks for GNU awk, so prefer the explicit "gawk" binary and only
  # fall back to whatever "awk" resolves to when gawk is not on PATH.
  _GAWK=$(command -v gawk) || _GAWK=$(command -v awk)
}
# Keep the results of a previous run: every known result file present in the
# current directory is renamed to "<name>.<YYYYmmdd+HHMMSS>".
# Globals written: _DATE (the timestamp suffix used for this backup).
backup() {
  _DATE=$(date +"%Y%m%d+%H%M%S")
  local -a _targets=(
    all_files_reltimes.tsv
    all_files_timestamps.csv
    all_files_timestamps_space_replaced.csv
    all_files_special_chars_replaced.csv
    files_usage_stats.csv
    files_access_stats.csv
    files_modifs_stats.csv
  )
  local _op _file
  for _op in ALL AGE SIZE ACCESS_TIME MODIF_TIME; do
    _targets+=("files_${_op}_stats.out")
  done
  for _file in "${_targets[@]}"; do
    # A missing file is expected (first run, or a different -a mode), so it is
    # skipped up front; real mv failures are still reported.
    [[ -e "${_file}" ]] || continue
    mv -- "${_file}" "${_file}.${_DATE}"
  done
}
# Delete the result files of a previous run from the current directory.
# Files that do not exist are ignored silently (rm errors are discarded).
clean() {
  local _leftover
  for _leftover in \
    all_files_reltimes.tsv \
    all_files_timestamps.csv \
    all_files_timestamps_space_replaced.csv \
    all_files_special_chars_replaced.csv \
    files_usage_stats.csv \
    files_access_stats.csv \
    files_modifs_stats.csv \
    files_ALL_stats.out \
    files_AGE_stats.out \
    files_SIZE_stats.out \
    files_ACCESS_TIME_stats.out \
    files_MODIF_TIME_stats.out
  do
    rm "${_leftover}" 2>/dev/null
  done
}
# Absolute directory holding this script; used to find the companion file
# awk_stats_on_1st_field.awk. "&&" keeps pwd from reporting the caller's
# directory when the cd fails.
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 && pwd -P )"

check_commands
# awk is the one tool every code path needs; abort early without it.
if [[ -z "${_GAWK}" ]]; then
  echo "Sorry, but GNU awk is required to run this program... Exiting" >&2
  exit 1
fi

# -K keeps the previous results under a timestamped name; otherwise they are
# removed before the new run.
if [[ "${KEEP}" == true ]]; then
  backup
else
  clean
fi
# EOC ("End Of find Command") is the -printf format given to find: path, size,
# access time and modification time, followed by three placeholders that are
# swapped for real find directives only when the matching option was requested.
# "::/::" is the field separator: '/' cannot appear inside a Linux file name,
# so the separator is unlikely to clash with problematic file names.
EOC="%p::/::%s::/::%A@::/::%T@::/::__birth__::/::__user__::/::__group__"
NL="\n"

if [[ "${WITH_CREATION_TIME}" == true ]]; then
  EOC="${EOC//__birth__/%B@}"
fi

if [[ "${WITH_USER}" == true || "${GROUPED_BY}" == "user" || "${GROUPED_BY}" == "both" || "${GROUPED_BY}" == "all" ]]; then
  EOC="${EOC//__user__/%u}"
fi

if [[ "${WITH_GROUP}" == true || "${GROUPED_BY}" == "group" || "${GROUPED_BY}" == "both" || "${GROUPED_BY}" == "all" ]]; then
  EOC="${EOC//__group__/%g}"
fi

if [[ "${WITH_FILE_COMMAND}" == true || "${GROUPED_BY}" == "filetype" || "${GROUPED_BY}" == "all" ]]; then
  # No newline in the format here: the output of "file -b" ends each record,
  # because the file command prints a NL character even with "-b" (brief).
  find "${_DIR}" -type f -printf "${EOC}::/::" -exec file -b {} \; > all_files_timestamps.csv
else
  find "${_DIR}" -type f -printf "${EOC}${NL}" > all_files_timestamps.csv
fi
# Reference "now" for every age computation, in seconds and in whole days.
CUR_EPOCH=$(date +%s)
EPOCH_DAYS=$(( CUR_EPOCH / (3600*24) ))

# Informational banner, skipped with -q (QUIET) or -Q (HIDE). The replacement
# tokens quoted below must match the ones the sed commands actually write
# ("-.sp.-" for whitespace, "-.dq.-" for double quotes).
if [[ "${HIDE}" == false ]] && [[ "${QUIET}" == false ]]; then
  echo "Current timestamp: ${CUR_EPOCH} seconds since EPOCH (01/01/1970)"
  echo "That is ${EPOCH_DAYS} days. If a field displays ${EPOCH_DAYS}[,.]*, then you may have a ~'noatime' mounted filesystem."
  echo "+~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~+"
  echo '|/!\ Replacing all space characters in filenames with "-.sp.-"; |'
  echo '| + Replacing all " characters in filenames with "-.dq.-"; |'
  echo '| Files are not renamed, only the filenames used by this script in its temporary {*.csv,*.tsv} files. |'
  echo '| If needed, do a diff between "all_files_timestamps.csv" and "all_files_special_chars_replaced.csv". |'
  echo '|[ Command ===>>> "diff all_files_timestamps.csv all_files_special_chars_replaced.csv" ]|'
  echo "'~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~'"
fi
# Step 1: every whitespace character of the raw find output becomes "-.sp.-",
# so that the whitespace-splitting tools used later see one token per field.
sed -r -e 's/\s/-.sp.-/g' all_files_timestamps.csv > all_files_timestamps_space_replaced.csv

# Step 2: every double quote becomes "-.dq.-".
sed -e 's/"/-.dq.-/g' all_files_timestamps_space_replaced.csv > all_files_special_chars_replaced.csv

# Step 3: write the TSV header, then one row per file in which the three
# timestamps are converted to an age in days relative to CUR_EPOCH.
printf 'Filename\tSize\tLast access (Days)\tLast modif. (Days)\tBirth date (Days)\tuser\tgroup\tFiletype\n' > all_files_reltimes.tsv
${_GAWK} -F"::/::" -v OFS="\t" -v EPOCH="${CUR_EPOCH}" -v OFMT="%.2f" '
{
  access_days = (EPOCH - $3) / (3600 * 24)
  modif_days  = (EPOCH - $4) / (3600 * 24)
  birth_days  = (EPOCH - $5) / (3600 * 24)
  print $1, $2, access_days, modif_days, birth_days, $6, $7, $8
}' all_files_special_chars_replaced.csv >> all_files_reltimes.tsv
# Helper for miller_f: emit a section title, then the output of the command
# given as the remaining arguments. With HIDE == false both are shown on
# stdout and appended to ${stat_results}; otherwise they are only appended.
_miller_emit() {
  local _title=$1
  shift
  if [[ "${HIDE}" == false ]]; then
    echo "${_title}" | tee -a "${stat_results}"
    "$@" | tee -a "${stat_results}"
  else
    echo "${_title}" >> "${stat_results}"
    "$@" >> "${stat_results}"
  fi
}

# Produce the report with Miller (mlr) from all_files_reltimes.tsv.
# Arguments: $1 - extra Miller verb chain (e.g. "then group-by user"), may be
#                 empty; it is deliberately expanded unquoted so that it splits
#                 into separate mlr arguments.
# Globals:   _COMPUTE, HIDE, QUIET, _MLR (read); stat_results (written).
miller_f() {
  stat_results="files_${_COMPUTE}_stats.out"

  # Translate the requested analysis into the three report families.
  local _do_size=false _do_access=false _do_modif=false
  case "${_COMPUTE}" in
    ALL) _do_size=true; _do_access=true; _do_modif=true ;;
    SIZE) _do_size=true ;;
    AGE) _do_access=true; _do_modif=true ;;
    ACCESS_TIME) _do_access=true ;;
    MODIF_TIME) _do_modif=true ;;
  esac

  # --- Top 5 listings ---
  if [[ "${_do_size}" == true ]]; then
    _miller_emit "Top 5 biggest files:" \
      ${_MLR} --itsv --opprint sort -nr Size ${1} then head -n5 all_files_reltimes.tsv
  fi
  if [[ "${_do_access}" == true ]]; then
    _miller_emit "Top 5 oldest accessed files:" \
      ${_MLR} --itsv --opprint sort -nr "Last access (Days)" ${1} then head -n5 all_files_reltimes.tsv
  fi
  if [[ "${_do_modif}" == true ]]; then
    _miller_emit "Top 5 oldest modified files:" \
      ${_MLR} --itsv --opprint sort -nr "Last modif. (Days)" ${1} then head -n5 all_files_reltimes.tsv
  fi

  if [[ "${HIDE}" == false ]] && [[ "${QUIET}" == false ]]; then
    echo "Overall statistics..."
  fi

  # --- Summary statistics ---
  if [[ "${_do_size}" == true ]]; then
    _miller_emit "File size statistics:" \
      ${_MLR} --itsv --opprint --from all_files_reltimes.tsv stats1 -a min,mean,max,sum,median,p25,p75,stddev,count -f Size then put '$Max_GB = $Size_max/(1024**3); $Total_of_files_GB = $Size_sum/(1024**3);'
  fi
  if [[ "${_do_access}" == true ]]; then
    _miller_emit "File access time statistics:" \
      ${_MLR} --itsv --opprint --from all_files_reltimes.tsv stats1 -a min,max,mean,median,p25,p75,stddev,count -f "Last access (Days)"
  fi
  if [[ "${_do_modif}" == true ]]; then
    _miller_emit "File modification time statistics:" \
      ${_MLR} --itsv --opprint --from all_files_reltimes.tsv stats1 -a min,max,mean,median,p25,p75,stddev,count -f "Last modif. (Days)"
  fi
}
# Produce the report with GNU datamash: the top-5 listings come from
# basic_sort_f, then one summary file is written per requested family
# (files_usage_stats.csv, files_access_stats.csv, files_modifs_stats.csv) and
# shown on stdout unless HIDE is set.
# Globals: _COMPUTE, HIDE, QUIET, _DATAMASH, _GAWK (read).
datamash_f() {
  basic_sort_f "${QUIET}" "${HIDE}" "${_COMPUTE}"

  # Size statistics are computed on column 2 of all_files_reltimes.tsv.
  case "${_COMPUTE}" in
    ALL|SIZE)
      ${_DATAMASH} -H max 2 q1 2 mean 2 q3 2 median 2 sum 2 pstdev 2 min 2 count 2 < all_files_reltimes.tsv > files_usage_stats.csv
      if [[ "${HIDE}" == false ]]; then
        echo "File size statistics"
        # Rows whose last field contains a digit get unit suffixes; other rows
        # (the datamash header) are printed unchanged.
        ${_GAWK} -v OFMT="%.2f" '{if($NF ~ "[[:digit:]]") {print $1/(1024**3)"GB",$2"bytes", $3/1024 "kbytes", $4 "bytes", $5 "bytes", $6/(1024**3)"GB", $7/1024"kbytes", $8 "bytes", $9 "files";} else {print;}}' files_usage_stats.csv | column -t
      fi
      ;;
  esac

  # Access-age statistics are computed on column 3.
  case "${_COMPUTE}" in
    ALL|AGE|ACCESS_TIME)
      LC_NUMERIC="en_US.UTF-8" ${_DATAMASH} -H min 3 max 3 q1 3 mean 3 q3 3 median 3 pstdev 3 count 3 < all_files_reltimes.tsv > files_access_stats.csv
      if [[ "${HIDE}" == false ]]; then
        echo "File access statistics"
        column -t files_access_stats.csv
      fi
      ;;
  esac

  # Modification-age statistics are computed on column 4.
  case "${_COMPUTE}" in
    ALL|AGE|MODIF_TIME)
      LC_NUMERIC="en_US.UTF-8" ${_DATAMASH} -H min 4 max 4 q1 4 mean 4 q3 4 median 4 pstdev 4 count 4 < all_files_reltimes.tsv > files_modifs_stats.csv
      if [[ "${HIDE}" == false ]]; then
        echo "File modification statistics"
        column -t files_modifs_stats.csv
      fi
      ;;
  esac
}
# Helper for awk_f: write the statistics of one column of
# all_files_reltimes.tsv into a result file.
# Arguments: $1 - column number, $2 - result file.
_awk_field_stats() {
  local _col=$1 _result=$2
  printf 'Sum\tFile_count\tAverage\tMedian\tstdev\tMin\tMax\n' > "${_result}"
  # NR > 1 drops the TSV header row, which is not a measurement and must not
  # be counted as a file.
  "${_GAWK}" -v col="${_col}" 'NR > 1 { print $col }' all_files_reltimes.tsv \
    | sort -n \
    | "${_GAWK}" -f "${SCRIPTPATH}/awk_stats_on_1st_field.awk" >> "${_result}"
}

# Produce the report with awk only (no mlr, no datamash): the top-5 listings
# come from basic_sort_f, the summaries from awk_stats_on_1st_field.awk located
# in SCRIPTPATH. Nothing is printed on stdout by this function when HIDE is
# set (-Q).
# Globals: _COMPUTE, HIDE, QUIET, _GAWK, SCRIPTPATH (read).
awk_f() {
  basic_sort_f "${QUIET}" "${HIDE}" "${_COMPUTE}"

  if [[ "${_COMPUTE}" == "ALL" ]] || [[ "${_COMPUTE}" == "SIZE" ]]; then
    _awk_field_stats 2 files_usage_stats.csv
    if [[ "${HIDE}" == false ]]; then
      echo "Size statistics (bytes)...."
      # Rows whose last field contains a digit get unit suffixes; the header
      # row is printed unchanged.
      "${_GAWK}" -v OFMT="%.2f" '{if($NF ~ "[[:digit:]]") {print $1/(1024**3)"GB",$2, $3/1024 "kbytes", $4/1024 "kbytes", $5/1024 "kbytes", $6 "bytes", $7/(1024**3)"GB";} else {print;}}' files_usage_stats.csv | column -t
    fi
  fi

  if [[ "${_COMPUTE}" == "ALL" ]] || [[ "${_COMPUTE}" == "AGE" ]] || [[ "${_COMPUTE}" == "ACCESS_TIME" ]]; then
    _awk_field_stats 3 files_access_stats.csv
    if [[ "${HIDE}" == false ]]; then
      echo "Access time statistics...."
      column -t files_access_stats.csv
    fi
  fi

  if [[ "${_COMPUTE}" == "ALL" ]] || [[ "${_COMPUTE}" == "AGE" ]] || [[ "${_COMPUTE}" == "MODIF_TIME" ]]; then
    _awk_field_stats 4 files_modifs_stats.csv
    if [[ "${HIDE}" == false ]]; then
      echo "Modif. time statistics...."
      column -t files_modifs_stats.csv
    fi
  fi
}
# Helper for basic_sort_f: emit a title and the five rows of
# all_files_reltimes.tsv (header skipped) with the largest value in one column.
# Arguments: $1 - title, $2 - sort column, $3 - hide flag, $4 - result file.
_basic_sort_top5() {
  local _title=$1 _key=$2 _hide=$3 _result=$4
  if [[ "${_hide}" == false ]]; then
    echo "${_title}" | tee -a "${_result}"
    tail -n +2 all_files_reltimes.tsv | sort -nr -k"${_key}" | head -5 | tee -a "${_result}"
  else
    # Append once only: combining "tee -a" with ">>" on the same file would
    # store every row twice.
    echo "${_title}" >> "${_result}"
    tail -n +2 all_files_reltimes.tsv | sort -nr -k"${_key}" | head -5 >> "${_result}"
  fi
}

# Top-5 listings built with tail/sort/head, used by the datamash and awk
# back-ends.
# Arguments: $1 - quiet flag (currently unused)
#            $2 - hide flag: "false" shows the listings on stdout as well
#            $3 - analysis type (ALL, AGE, SIZE, ACCESS_TIME or MODIF_TIME)
# Globals:   stat_results (written) - files_<$3>_stats.out, the result file.
basic_sort_f() {
  stat_results="files_${3}_stats.out"
  local _hide=$2 _mode=$3

  if [[ "${_mode}" == "ALL" || "${_mode}" == "SIZE" ]]; then
    _basic_sort_top5 "Top 5 biggest files:" 2 "${_hide}" "${stat_results}"
  fi
  if [[ "${_mode}" == "ALL" || "${_mode}" == "AGE" || "${_mode}" == "ACCESS_TIME" ]]; then
    _basic_sort_top5 "Top 5 oldest accessed files:" 3 "${_hide}" "${stat_results}"
  fi
  if [[ "${_mode}" == "ALL" || "${_mode}" == "AGE" || "${_mode}" == "MODIF_TIME" ]]; then
    _basic_sort_top5 "Top 5 oldest modified files:" 4 "${_hide}" "${stat_results}"
  fi
}
# Report the five users owning the most files, as "<user> <n>/<total> <pct>%".
# Reads column 6 of all_files_reltimes.tsv; the header row is skipped and rows
# with fewer than 6 fields are not attributed to any user.
# Arguments: $1 - quiet flag (currently unused)
#            $2 - hide flag: "false" shows the report on stdout as well
#            $3 - result file the report is appended to
count_users() {
  local _hide=$2 _result=$3
  local _prog='NR > 1 && NF >= 6 { seen[$6]++ }
    END { for (k in seen) print k, seen[k] "/" (NR-1), seen[k]*100/(NR-1) "%" }'
  if [[ "${_hide}" == false ]]; then
    echo "Top 5 users:" | tee -a "${_result}"
    "${_GAWK}" "${_prog}" all_files_reltimes.tsv | sort -nr -k2 | head -5 | tee -a "${_result}"
  else
    echo "Top 5 users:" >> "${_result}"
    "${_GAWK}" "${_prog}" all_files_reltimes.tsv | sort -nr -k2 | head -5 >> "${_result}"
  fi
}
# Report the five groups owning the most files, as "<group> <n>/<total> <pct>%".
# Reads column 7 of all_files_reltimes.tsv; the header row is skipped and rows
# with fewer than 7 fields are not attributed to any group.
# Arguments: $1 - quiet flag (currently unused)
#            $2 - hide flag: "false" shows the report on stdout as well
#            $3 - result file the report is appended to
count_groups() {
  local _hide=$2 _result=$3
  local _prog='NR > 1 && NF >= 7 { seen[$7]++ }
    END { for (k in seen) print k, seen[k] "/" (NR-1), seen[k]*100/(NR-1) "%" }'
  if [[ "${_hide}" == false ]]; then
    echo "Top 5 groups:" | tee -a "${_result}"
    "${_GAWK}" "${_prog}" all_files_reltimes.tsv | sort -nr -k2 | head -5 | tee -a "${_result}"
  else
    echo "Top 5 groups:" >> "${_result}"
    "${_GAWK}" "${_prog}" all_files_reltimes.tsv | sort -nr -k2 | head -5 >> "${_result}"
  fi
}
# Report the five most frequent file types, as "<type> <n>/<total> <pct>%".
# Reads column 8 of all_files_reltimes.tsv; the header row is skipped and rows
# with fewer than 8 fields (no file type recorded) are not counted under an
# empty type.
# Arguments: $1 - quiet flag (currently unused)
#            $2 - hide flag: "false" shows the report on stdout as well
#            $3 - result file the report is appended to
count_filetypes() {
  local _hide=$2 _result=$3
  local _prog='NR > 1 && NF >= 8 { seen[$8]++ }
    END { for (k in seen) print k, seen[k] "/" (NR-1), seen[k]*100/(NR-1) "%" }'
  if [[ "${_hide}" == false ]]; then
    echo "Top 5 filetypes:" | tee -a "${_result}"
    "${_GAWK}" "${_prog}" all_files_reltimes.tsv | sort -nr -k2 | head -5 | tee -a "${_result}"
  else
    echo "Top 5 filetypes:" >> "${_result}"
    "${_GAWK}" "${_prog}" all_files_reltimes.tsv | sort -nr -k2 | head -5 >> "${_result}"
  fi
}
# Run the "Top 5" counters that match the -G grouping, appending to the
# result file of the current analysis. Any other grouping value (including
# "none") runs nothing.
_run_group_counts() {
  stat_results="files_${_COMPUTE}_stats.out"
  case "${GROUPED_BY}" in
    both)
      count_users "${QUIET}" "${HIDE}" "${stat_results}"
      count_groups "${QUIET}" "${HIDE}" "${stat_results}"
      ;;
    user)
      count_users "${QUIET}" "${HIDE}" "${stat_results}"
      ;;
    group)
      count_groups "${QUIET}" "${HIDE}" "${stat_results}"
      ;;
    filetype)
      count_filetypes "${QUIET}" "${HIDE}" "${stat_results}"
      ;;
    all)
      count_users "${QUIET}" "${HIDE}" "${stat_results}"
      count_groups "${QUIET}" "${HIDE}" "${stat_results}"
      count_filetypes "${QUIET}" "${HIDE}" "${stat_results}"
      ;;
  esac
}

# Back-end selection: Miller when available, then datamash, then plain awk.
if [[ -n ${_MLR} ]]; then
  # Only the Miller back-end groups its listings: build the extra verb chain
  # handed to miller_f from the -G value.
  case "${GROUPED_BY}" in
    both) GROUP_CMD="then group-by user,group" ;;
    user) GROUP_CMD="then group-by user" ;;
    group) GROUP_CMD="then group-by group" ;;
    filetype) GROUP_CMD="then group-by Filetype" ;;
    all) GROUP_CMD="then group-by user,group,Filetype" ;;
    *) GROUP_CMD="" ;;
  esac
  # With Miller the counters run before the main report ...
  _run_group_counts
  miller_f "${GROUP_CMD}"
elif [[ -n ${_DATAMASH} ]]; then
  # ... with the other two back-ends they run after it.
  datamash_f
  _run_group_counts
else
  awk_f
  _run_group_counts
fi
Please register or sign in to comment