From 16a2593e30b71f06f4232aa7dd62cc2c93fcd562 Mon Sep 17 00:00:00 2001 From: "Mark (he/his) C. Miller" Date: Tue, 18 Apr 2023 00:42:41 -0700 Subject: [PATCH 01/81] Create check-published-links-weekly.yml --- .../check-published-links-weekly.yml | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/check-published-links-weekly.yml diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml new file mode 100644 index 0000000000..3af9b3d3ae --- /dev/null +++ b/.github/workflows/check-published-links-weekly.yml @@ -0,0 +1,40 @@ +name: Check published links weekly +on: + # Run every Sunday evening. + schedule: + # 5:17 AM every Sunday + - cron: '17 5 * * 0' + # Allow for manually running also. + workflow_dispatch: + +jobs: + urlcheck: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: URL Checker + + uses: urlstechie/urlchecker-action@0.0.34 + with: + # A comma-separated list of file types to cover in the URL checks + file_types: .md + + # Choose whether to include file with no URLs in the prints. + print_all: false + + # More verbose summary at the end of a run + verbose: true + + # How many times to retry a failed request (defaults to 1) + retry_count: 3 + + # Google Forms is having enormous timeouts + timeout: 10 + + # Exclude these patterns from the checker + exclude_patterns: http://localhost:4000 + + # Exclude these files from the checker + exclude_files: _config.yml,.github/workflows From d73e0d1265f5a524bf6f8f0f918b9de0f8e84954 Mon Sep 17 00:00:00 2001 From: "Mark (he/his) C. Miller" Date: Tue, 18 Apr 2023 09:47:55 -0700 Subject: [PATCH 02/81] skip files in utils dirs --- .github/workflows/check-published-links-weekly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index 3af9b3d3ae..3448248cf3 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -37,4 +37,4 @@ jobs: exclude_patterns: http://localhost:4000 # Exclude these files from the checker - exclude_files: _config.yml,.github/workflows + exclude_files: docs/_config.yml,.github/workflows,utils From 90129d9040cde8cbd2f541858e1822f974f9545d Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Tue, 18 Apr 2023 09:56:56 -0700 Subject: [PATCH 03/81] fix some links --- Articles/Blog/2020-01-usrse.md | 4 +--- Articles/Blog/2021-09-CollegevilleReportDay1.md | 2 +- Articles/Blog/2021-12-sc21-swe-cse-bof.md | 2 +- Articles/Blog/Covid19WorkstationCleanliness.md | 8 ++++---- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/Articles/Blog/2020-01-usrse.md b/Articles/Blog/2020-01-usrse.md index 207bed46b5..e81995a83c 100644 --- a/Articles/Blog/2020-01-usrse.md +++ b/Articles/Blog/2020-01-usrse.md @@ -31,9 +31,7 @@ fundamental part of their work. In the United States and globally, over the past two to three years (except the UK, which seems to be a few years ahead of the trend), the use of the term RSE has exploded. The RSE role was formally recognized in 2019 by the European Commission in a report -“[Recognising the Importance of Software in Research - Research Software -Engineers (RSEs), a UK -Example](https://ec.europa.eu/info/sites/info/files/research_and_innovation/importance_of_software_in_research.pdf)” +“[Recognising the Importance of Software in Research - Research Software Engineers (RSEs), a UK Example](https://op.europa.eu/s/yA6b)” and by UNESCO in the report “[Paris Call: Software Source Code as Heritage for Sustainable Development](https://en.unesco.org/foss/paris-call-software-source-code)”. As an diff --git a/Articles/Blog/2021-09-CollegevilleReportDay1.md b/Articles/Blog/2021-09-CollegevilleReportDay1.md index bb96ad627d..3f10968865 100644 --- a/Articles/Blog/2021-09-CollegevilleReportDay1.md +++ b/Articles/Blog/2021-09-CollegevilleReportDay1.md @@ -6,7 +6,7 @@ - -#### Contributed by: [Cody Balos](https://github.com/balos1), [Jed Brown](https://github.com/jedbrown), [Gerasimos Chourdakis](https://github.com/MakisH), [Ben Cowan](https://github.com/benc303), [Anshu Dubey](https://github.com/adubey64), [Vadim Dyadechko](https://github.com/vdayadechko), [Robert Jacob](https://github.com/rljacob), [Sarah Knepper](https://github.com/sknepper), [Jay Lofstead](https://github.com/gflofst), [Lois Curfman McInnes](https://github.com/curfman), [Reed Milewicz](https://github.com/rmmilewi), [Jacob Moxley](https://github.com/jmox0351), [Todd Munson](https://github.com/tmunson), [Sarah Osborn](https://github.com/osborn9), [Jim Pivarski](https://github.com/jpivarski), [Elaine Raybourn](https://github.com/elainraybourn), [Barry Smith](https://github.com/BarrySmith), [James Willenbring](https://github.com/jwillenbring), [Ulrike Meier Yang](https://github.com/ulrikeyang), [Sam Yates](https://github.com/halfflat), [Michael A. Heroux](https://github.com/maherou), and [Johanna Cohoon](https://github.com/jlcohoon) +#### Contributed by: [Cody Balos](https://github.com/balos1), [Jed Brown](https://github.com/jedbrown), [Gerasimos Chourdakis](https://github.com/MakisH), [Ben Cowan](https://github.com/benc303), [Anshu Dubey](https://github.com/adubey64), [Vadim Dyadechko](https://github.com/vdyadechko), [Robert Jacob](https://github.com/rljacob), [Sarah Knepper](https://github.com/sknepper), [Jay Lofstead](https://github.com/gflofst), [Lois Curfman McInnes](https://github.com/curfman), [Reed Milewicz](https://github.com/rmmilewi), [Jacob Moxley](https://github.com/jmox0351), [Todd Munson](https://github.com/tmunson), [Sarah Osborn](https://github.com/osborn9), [Jim Pivarski](https://github.com/jpivarski), [Elaine Raybourn](https://github.com/elaineraybourn), [Barry Smith](https://github.com/BarrySmith), [James Willenbring](https://github.com/jwillenbring), [Ulrike Meier Yang](https://github.com/ulrikeyang), [Sam Yates](https://github.com/halfflat), [Michael A. Heroux](https://github.com/maherou), and [Johanna Cohoon](https://github.com/jlcohoon) #### Publication date: September 20, 2021 diff --git a/Articles/Blog/2021-12-sc21-swe-cse-bof.md b/Articles/Blog/2021-12-sc21-swe-cse-bof.md index b82a5214dc..22d174fb04 100644 --- a/Articles/Blog/2021-12-sc21-swe-cse-bof.md +++ b/Articles/Blog/2021-12-sc21-swe-cse-bof.md @@ -4,7 +4,7 @@ - -#### Contributed by: [David E. Bernholdt](https://github.com/bernhold), [Michael Bader](https://github.com/baderml), Michelle Barker, Ben Brown, [Anshu Dubey](https://github.com/adubey64), [Nasir Eisty](https://github.com/neistyS), [Sandra Gesing](https://github.com/sandragesing), [Patricia Grubel](https://github.com/pagrubel), [Rinku Gupta](https://github.com/rinkug), [Michael A. Heroux](https://github.com/maherou), [Saswata Hier-Majumder](https://github.com/sashgeophysics), [Axel Huebl](https://github.com/ax3l), [Mozhgan Kabiri Chimeh](https://github.com/mozhgan-kch), [Daniel S. Katz](https://github.com/danielskatz), [Tomislav Maric](https://github.com/tmaric), [Lois Curfman McInnes](https://github.com/curfman), Bill Miller, [Manish Parashar](https://github.com/parasharmanish), [Ulf D. Schiller](https://github.com/uschille), [Jean Sexton](https://github.com/jmsexton03), [Peter Vaillancourt](https://github.com/sk8forether), Marion Weinzierl, [Yo Yehudi](https://github.com/yochannah) +#### Contributed by: [David E. Bernholdt](https://github.com/bernhold), [Michael Bader](https://github.com/baderml), Michelle Barker, Ben Brown, [Anshu Dubey](https://github.com/adubey64), [Nasir Eisty](https://github.com/neisty), [Sandra Gesing](https://github.com/sandragesing), [Patricia Grubel](https://github.com/pagrubel), [Rinku Gupta](https://github.com/rinkug), [Michael A. Heroux](https://github.com/maherou), [Saswata Hier-Majumder](https://github.com/sashgeophysics), [Axel Huebl](https://github.com/ax3l), [Mozhgan Kabiri Chimeh](https://github.com/mozhgan-kch), [Daniel S. Katz](https://github.com/danielskatz), [Tomislav Maric](https://github.com/tmaric), [Lois Curfman McInnes](https://github.com/curfman), Bill Miller, [Manish Parashar](https://github.com/parasharmanish), [Ulf D. Schiller](https://github.com/uschille), [Jean Sexton](https://github.com/jmsexton03), [Peter Vaillancourt](https://github.com/sk8forether), Marion Weinzierl, [Yo Yehudi](https://github.com/yochannah) #### Publication date: December 21, 2021 diff --git a/Articles/Blog/Covid19WorkstationCleanliness.md b/Articles/Blog/Covid19WorkstationCleanliness.md index cb98a06d33..15eafe1aae 100644 --- a/Articles/Blog/Covid19WorkstationCleanliness.md +++ b/Articles/Blog/Covid19WorkstationCleanliness.md @@ -217,19 +217,19 @@ Aggregate: none
-[1-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/about/transmission.html "CDC guidance on CV-19 transmission" +[1-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/prevent-getting-sick/how-covid-spreads.html "CDC guidance on CV-19 transmission" [2-sfer-ezikiw]: https://www.who.int/health-topics/coronavirus "WHO summary remarks of COVID-19" [3-sfer-ezikiw]: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4659470/ "NIH CV-229E surface study" [4-sfer-ezikiw]: https://www.medrxiv.org/content/10.1101/2020.03.09.20033217v2.full.pdf "Aerosol and surface stability of SARS-CoV-2 compared with SARS-CoV-1" [5-sfer-ezikiw]: https://www.journalofhospitalinfection.com/article/S0195-6701(20)30046-3/fulltext "Surface study of various coronaviruses" [6-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/community/organizations/cleaning-disinfection.html#How%20to%20Clean%20and%20Disinfect "CDC guidance on cleaning surfaces" -[7-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/about/prevention.html?CDC_AA_refVal=https%3A%2F%2Fwww.cdc.gov%2Fcoronavirus%2F2019-ncov%2Fabout%2Fprevention-treatment.html "CDC description of transmission scenario" +[7-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/science/science-briefs/sars-cov-2-transmission.html "CDC description of transmission scenario" [8-sfer-ezikiw]: https://en.wikipedia.org/wiki/Coronavirus "Wikipedia summary of coronavirus" [9-sfer-ezikiw]: https://www.who.int/emergencies/diseases/novel-coronavirus-2019/technical-guidance/naming-the-coronavirus-disease-(covid-2019)-and-the-virus-that-causes-it "CDC explains names SARS-CoV-2 and COVID-19" -[10-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/specific-groups/high-risk-complications.html "CDC describes higher risk persons" +[10-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/need-extra-precautions/people-with-medical-conditions.html "CDC describes higher risk persons" [11-sfer-ezikiw]: https://jamanetwork.com/journals/jama/fullarticle/2762028 "Asymptomatic transmission of COVID-19" [12-sfer-ezikiw]: https://en.wikipedia.org/wiki/Fomite "Explanation of the term Fomite" -[13-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/prepare/cleaning-disinfection.html?CDC_AA_refVal=https%3A%2F%2Fwww.cdc.gov%2Fcoronavirus%2F2019-ncov%2Fcommunity%2Fhome%2Fcleaning-disinfection.html "CDC guidance on fomite transmission" +[13-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/more/science-and-research/surface-transmission.html "CDC guidance on fomite transmission" [14-sfer-ezikiw]: https://learningregistry.org/reviews/best-disinfectant-wipes "Best disinfecting wipes" [15-sfer-ezikiw]: https://www.pcmag.com/how-to/how-to-spring-clean-your-electronics "Best practices for cleaning electronics" [16-sfer-ezikiw]: https://support.apple.com/en-us/HT204172?mod=article_inline "Apple guidance on 70% alcohol" From 025bdbd25a7a03f98117e6301423013c62139426 Mon Sep 17 00:00:00 2001 From: "Mark (he/his) C. Miller" Date: Tue, 18 Apr 2023 12:06:50 -0700 Subject: [PATCH 04/81] Update check-published-links-weekly.yml --- .github/workflows/check-published-links-weekly.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index 3448248cf3..0bdbf8f131 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -1,8 +1,7 @@ name: Check published links weekly on: - # Run every Sunday evening. schedule: - # 5:17 AM every Sunday + # 5:17 AM every Sunday - cron: '17 5 * * 0' # Allow for manually running also. workflow_dispatch: From 3ebc92ea6a1fef6680adccd7b3e97e7a08b287fe Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Tue, 18 Apr 2023 12:28:48 -0700 Subject: [PATCH 05/81] fix link --- Articles/Blog/ConnectingSoftwareDevelopers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Articles/Blog/ConnectingSoftwareDevelopers.md b/Articles/Blog/ConnectingSoftwareDevelopers.md index 300528f84f..4ab242f907 100644 --- a/Articles/Blog/ConnectingSoftwareDevelopers.md +++ b/Articles/Blog/ConnectingSoftwareDevelopers.md @@ -56,7 +56,7 @@ More details about these two events can be found on the [ORNL Software Expo web Gregory Watson is a Senior Research Scientist in the Research Software Engineering Group at Oak Ridge National Laboratory. He completed his Ph.D. in Computer Science in 2000 from Monash University. Dr. Watson's research interests include software engineering practices, development environments, programming tools, and modeling and simulation tools for high performance and scientific computing. He is founder of the Eclipse Parallel Tools Platform, and project leader of the Eclipse Science Top Level Project. - + Elsa Gonsiorowski is an HPC I/O Specialist working at Lawrence Livermore National Laboratory. She graduated with her Ph.D. in Computer Science in 2016 from Rensselaer Polytechnic Institute. Elsa works on a number of open source, system software tools to support HPC users as they manage files across an increasingly complex storage hierarchy. She has a passion for useful documentation and CMake. -Elsa Gonsiorowski is an HPC I/O Specialist working at Lawrence Livermore National Laboratory. She graduated with her Ph.D. in Computer Science in 2016 from Rensselaer Polytechnic Institute. Elsa works on a number of open source, system software tools to support HPC users as they manage files across an increasingly complex storage hierarchy. She has a passion for useful documentation and CMake. +Elsa Gonsiorowski is an HPC I/O Specialist working at Lawrence Livermore National Laboratory. She graduated with her Ph.D. in Computer Science in 2016 from Rensselaer Polytechnic Institute. Elsa works on a number of open source, system software tools to support HPC users as they manage files across an increasingly complex storage hierarchy. She has a passion for useful documentation and CMake. From 97945895161ca57fa3ffb0fcf418a05cb2c9e6d1 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Tue, 18 Apr 2023 17:11:06 -0700 Subject: [PATCH 19/81] no run CI on fork --- .github/workflows/check-published-links-weekly.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index aa05d2495a..2507d3780e 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -7,7 +7,11 @@ on: workflow_dispatch: jobs: + urlcheck: + + if: github.repository_owner == 'betterscientificsoftware' + runs-on: ubuntu-latest steps: From 139349551f7b5244dd0670f3e8c042c42e624890 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 22 Jan 2024 19:54:05 -0800 Subject: [PATCH 20/81] rework as per requests --- .../check-published-links-weekly.yml | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index 2507d3780e..20d66c5b50 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -1,26 +1,45 @@ -name: Check published links weekly +name: Check URLs + on: schedule: - # 5:17 AM every Sunday - - cron: '17 5 * * 0' - # Allow for manually running also. - workflow_dispatch: + - cron: '17 5 * * 0' # 5:17 AM every Sunday + pull_request: + branches: [ main ] + workflow_dispatch: # Allows manual triggering jobs: - - urlcheck: - - if: github.repository_owner == 'betterscientificsoftware' - + check-urls: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - name: Checkout Repository + uses: actions/checkout@v3 - - name: URL Checker + - name: Get Changed Files (for PRs) + id: changed-files + if: ${{ github.event_name == 'pull_request' }} + uses: tj-actions/changed-files@v42 + with: + files: **/*.md + files_ignore: | + docs/_config.yml + **/.github/** + **/utils/** + **/docs/** + + - name: Generate list of selected files to URL check + id: file-list + run: | + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "::set-output name=files::${{ steps.changed-files.outputs.all_changed_files }}" + else + echo "::set-output name=files::$(find . -name .github -prune -false -o -name utils -prune -false -o -name docs -prune -false -name '*.md')" + fi + - name: Check URLs in selected files uses: urlstechie/urlchecker-action@0.0.34 with: + # A comma-separated list of file types to cover in the URL checks file_types: .md @@ -39,5 +58,9 @@ jobs: # Exclude these patterns from the checker exclude_patterns: http://localhost:4000,https://preview.bssw.io,https://github.com/ - # Exclude these files from the checker - exclude_files: docs/_config.yml,.github,utils,docs,Events + # Operate only on the specific list of files selected above + include_files: "${{ steps.file-list.outputs.files }}" + +# Description: +# 5:17 AM every Sunday +# Allow for manually running also. From 985875e173024052c99f299e075552b3c7345f41 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 22 Jan 2024 19:55:27 -0800 Subject: [PATCH 21/81] fix yaml error --- .github/workflows/check-published-links-weekly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index 20d66c5b50..ca55919d34 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -16,8 +16,8 @@ jobs: uses: actions/checkout@v3 - name: Get Changed Files (for PRs) - id: changed-files if: ${{ github.event_name == 'pull_request' }} + id: changed-files uses: tj-actions/changed-files@v42 with: files: **/*.md From f8bd433f5eb1e4eff88ecb3e6c947c2eaf047a31 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 22 Jan 2024 20:00:30 -0800 Subject: [PATCH 22/81] re-arrange triggers --- .github/workflows/check-published-links-weekly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index ca55919d34..8ac510b327 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -1,11 +1,11 @@ name: Check URLs on: + workflow_dispatch: # Allows manual triggering schedule: - cron: '17 5 * * 0' # 5:17 AM every Sunday pull_request: branches: [ main ] - workflow_dispatch: # Allows manual triggering jobs: check-urls: From aeafe2d3d154c21536dbcc4ac41e9804e4600a5c Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 22 Jan 2024 20:03:24 -0800 Subject: [PATCH 23/81] fix yaml syntax error --- .github/workflows/check-published-links-weekly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index 8ac510b327..d57d884808 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -20,7 +20,7 @@ jobs: id: changed-files uses: tj-actions/changed-files@v42 with: - files: **/*.md + files: **.md files_ignore: | docs/_config.yml **/.github/** From e6b8b23efaff75745c1d67d62e1e76e9ce3835bc Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 22 Jan 2024 20:06:25 -0800 Subject: [PATCH 24/81] fix yaml syntax error --- .github/workflows/check-published-links-weekly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index d57d884808..c99c4c7026 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -20,7 +20,7 @@ jobs: id: changed-files uses: tj-actions/changed-files@v42 with: - files: **.md + files: '**.md' files_ignore: | docs/_config.yml **/.github/** From 5296721793e0df885a2f8760de93c36add266a1c Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Thu, 25 Jan 2024 20:46:11 -0800 Subject: [PATCH 25/81] update from testing --- .../check-published-links-weekly.yml | 65 ++++++++++++++----- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index c99c4c7026..48b73618c0 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -1,46 +1,68 @@ name: Check URLs on: - workflow_dispatch: # Allows manual triggering + workflow_dispatch: schedule: - cron: '17 5 * * 0' # 5:17 AM every Sunday pull_request: branches: [ main ] +env: + ignore_patterns: | + http://localhost:4000 + https://preview.bssw.io + https://github.com/ + ignore_dirs: | + .github + docs + images + utils + ignore_files: | + foo jobs: check-urls: runs-on: ubuntu-latest steps: + - name: Reformat enviornment variables + id: setup_vars + run: | + tmp=$(echo "${{ env.ignore_patterns }}" | tr '\n' ',' | sed -e 's/,,$//') + echo "ignore_patterns=$tmp" >> $GITHUB_OUTPUT + tmp=$(echo "${{ env.ignore_dirs }}" | sed -e 's@\(.*\)@"**/\1/**"@' | tr '\n' ',' | sed -e 's@,"\*\*//\*\*",$@@') + echo "ignore_dirs=$tmp" >> $GITHUB_OUTPUT + - name: Checkout Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Get Changed Files (for PRs) if: ${{ github.event_name == 'pull_request' }} id: changed-files uses: tj-actions/changed-files@v42 with: + separator: ',' files: '**.md' - files_ignore: | - docs/_config.yml - **/.github/** - **/utils/** - **/docs/** + files_ignore: ${{ steps.setup_vars.outputs.ignore_dirs }} - name: Generate list of selected files to URL check - id: file-list + id: file_list run: | if [ "${{ github.event_name }}" = "pull_request" ]; then - echo "::set-output name=files::${{ steps.changed-files.outputs.all_changed_files }}" + echo "files=${{ steps.changed-files.outputs.all_changed_files }}" >> $GITHUB_OUTPUT else - echo "::set-output name=files::$(find . -name .github -prune -false -o -name utils -prune -false -o -name docs -prune -false -name '*.md')" + fcmd="" + tmp=$(echo "${{ env.ignore_dirs }}" | tr '\n' ' ' | sed -e 's/ $//') + for d in $ignore_dirs; do + fcmd="$fcmd -name $d -prune -o " + done + echo "files=$(find . $fcmd -name '*.md' -print | tr '\n' ',')" >> $GITHUB_OUTPUT fi - name: Check URLs in selected files uses: urlstechie/urlchecker-action@0.0.34 with: - # A comma-separated list of file types to cover in the URL checks + # Work only on markdown files file_types: .md # Choose whether to include file with no URLs in the prints. @@ -56,11 +78,24 @@ jobs: timeout: 10 # Exclude these patterns from the checker - exclude_patterns: http://localhost:4000,https://preview.bssw.io,https://github.com/ + exclude_patterns: ${{ steps.setup_vars.outputs.ignore_patterns }} # Operate only on the specific list of files selected above - include_files: "${{ steps.file-list.outputs.files }}" + include_files: "${{ steps.file_list.outputs.files }}" +# # Description: -# 5:17 AM every Sunday -# Allow for manually running also. +# +# Triggers in one of three ways; 1) manually, 2) scheduled weekly Sunday's 5:17 AM +# or 3) pull request +# +# Stores ignore cases in env. variables and then reformats those (because they have +# newlines) into a form that can be digested as inputs to other actions. +# +# Uses changed-files action in the case of a pull request. Otherwise, uses a find +# command but builds up the find command to ignore specified directories. +# +# From either the changed files of a pull request or a find command, generates a +# list of files to examine. Eventually, the string variable holding all the file +# names may get too long. +# From 7b1d0aa28e8575f6d7b0bbfd8ffc6cc5e82c2672 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Thu, 25 Jan 2024 21:05:57 -0800 Subject: [PATCH 26/81] fix spelling error --- .github/workflows/check-published-links-weekly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index 48b73618c0..b63cad3702 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-latest steps: - - name: Reformat enviornment variables + - name: Reformat environment variables id: setup_vars run: | tmp=$(echo "${{ env.ignore_patterns }}" | tr '\n' ',' | sed -e 's/,,$//') From 7c2350e5806da66f637f4250f44a0db193883750 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 27 Jan 2024 17:25:08 -0800 Subject: [PATCH 27/81] fix link to hdf5 --- Articles/Blog/2020-11-PSIP4HDF5.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Articles/Blog/2020-11-PSIP4HDF5.md b/Articles/Blog/2020-11-PSIP4HDF5.md index 418f2321e5..453704860a 100644 --- a/Articles/Blog/2020-11-PSIP4HDF5.md +++ b/Articles/Blog/2020-11-PSIP4HDF5.md @@ -55,7 +55,7 @@ In April of 2020, the [LLNL WSC](https://wci.llnl.gov/about-us/weapon-simulation-and-computing) Research Coordination Council established a $150K contract with [The HDF Group](https://www.hdfgroup.org) (THG), the organization responsible for developing and maintaining -[HDF5](https://portal.hdfgroup.org/display/HDF5/HDF5), to employ PSIP to effect some +[HDF5](https://docs.hdfgroup.org/hdf5/develop/index.html), to employ PSIP to effect some software process improvement goals (IGs). Interested readers are welcome to read the [full report](https://www.osti.gov/biblio/1698291-psip-hdf5pilot-project-final-report). From 6adfe912eb36767933b5bad0bad1118b4804747a Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 27 Jan 2024 17:28:42 -0800 Subject: [PATCH 28/81] fix ideas classic link --- CuratedContent/kitchen-sink-TEST.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CuratedContent/kitchen-sink-TEST.md b/CuratedContent/kitchen-sink-TEST.md index 85f676e45e..ada233a973 100644 --- a/CuratedContent/kitchen-sink-TEST.md +++ b/CuratedContent/kitchen-sink-TEST.md @@ -4,8 +4,8 @@ The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. -- [What is good documentation?](https://ideas-productivity.org/wordpress/wp-content/uploads/2016/04/IDEAS-DocumentationHowToWriteGoodDocumentation-V0.1.pdf) -- [How to write good documentation](HowToWriteGoodDocumentation.md) +- [What is good documentation?](https://bssw.io/blog_posts/writing-good-documentation-for-cse-software) +- [How to write good documentation](HowToWriteGitCommit.md) From 530a7922ef665040606a33f2b52bc49f74f47e2a Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 27 Jan 2024 17:55:07 -0800 Subject: [PATCH 29/81] fix links --- CuratedContent/LanguageReferenceOnLine.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CuratedContent/LanguageReferenceOnLine.md b/CuratedContent/LanguageReferenceOnLine.md index b1f8416fbc..e4b360f2d4 100644 --- a/CuratedContent/LanguageReferenceOnLine.md +++ b/CuratedContent/LanguageReferenceOnLine.md @@ -170,7 +170,7 @@ Version Control | [Git][api-git]/[Subversion][api-svn]/[GitLab][api-gitlab]/[Git [c-ibm]: https://www.ibm.com/docs/en/ssw_ibm_i_71/rzarg/sc097852.pdf [c-ms]: https://docs.microsoft.com/en-us/cpp/c-language/c-language-reference?view=msvc-170 [c-clang]: https://clang.llvm.org -[c-amd]: https://developer.amd.com/amd-aocc/ +[c-amd]: https://www.amd.com/content/dam/amd/en/documents/developer/version-4-1-documents/aocc/aocc-4.1-user-guide.pdf [//]: # (C++ language reference URLs) @@ -178,7 +178,7 @@ Version Control | [Git][api-git]/[Subversion][api-svn]/[GitLab][api-gitlab]/[Git [c++-cray]: https://support.hpe.com/hpesc/public/docDisplay?docId=a00115116en_us&docLocale=en_US&page=The_Cray_Compiling_Environment.html [c++-ibm]: https://www.ibm.com/docs/en/ssw_ibm_i_71/rzarg/sc097852.pdf [c++-ms]: https://docs.microsoft.com/en-us/cpp/cpp/cpp-language-reference?view=msvc-170 -[c++-amd]: https://developer.amd.com/amd-aocc/ +[c++-amd]: https://www.amd.com/content/dam/amd/en/documents/developer/version-4-1-documents/aocc/aocc-4.1-user-guide.pdf [c++-clang]: https://clang.llvm.org/cxx_status.html [//]: # (Fortran language reference URLs) @@ -188,12 +188,12 @@ Version Control | [Git][api-git]/[Subversion][api-svn]/[GitLab][api-gitlab]/[Git [f-intel]: https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/language-reference.html "All Fortran standards 90-18" [f-cray]: https://support.hpe.com/hpesc/public/docDisplay?docId=a00115296en_us&page=About_the_Cray_Fortran_Reference_Manual.html [f-ibm]: https://www.ibm.com/support/pages/system/files/support/swg/swgdocs.nsf/0/7e46ea600b6646d0852579dc00331978/$FILE/langref.pdf -[f-nag]: https://www.nag.com/nagware/np/r70_doc/compiler.pdf +[f-nag]: https://support.nag.com/nagware/np/r71_doc/compiler.pdf [f-gnu]: https://devdocs.io/gnu_fortran/ [//]: # (GPU language reference URLs) -[opencl-amd]: https://rocmdocs.amd.com/en/latest/Programming_Guides/Opencl-programming-guide.html#opencl-programming-guide +[opencl-amd]: https://github.com/KhronosGroup/OpenCL-Guide [opencl-intel]: https://www.intel.com/content/www/us/en/develop/documentation/iocl_rt_ref/top.html [opencl-nvidia]: https://developer.download.nvidia.com/compute/DevZone/docs/html/OpenCL/doc/OpenCL_Programming_Guide.pdf @@ -257,7 +257,7 @@ Version Control | [Git][api-git]/[Subversion][api-svn]/[GitLab][api-gitlab]/[Git [//]: # (Portable Parallelism via Abstract Data) -[ppard-kokkos]: https://kokkos.org/programming-guide/ +[ppard-kokkos]: https://kokkos.org/kokkos-core-wiki/ [ppard-ga]: https://hpc.pnl.gov/globalarrays/documentation.shtml [ppard-legion]: https://legion.stanford.edu/pdfs/legion-manual.pdf [ppard-charm++]: https://charm.readthedocs.io/en/latest/charm++/manual.html @@ -285,7 +285,7 @@ Version Control | [Git][api-git]/[Subversion][api-svn]/[GitLab][api-gitlab]/[Git [api-sftp]: https://access.redhat.com/articles/5594481 [api-scp]: https://www.computerhope.com/unix/scp.htm -[api-hpss]: https://www.hpss-collaboration.org/documents/HPSS_7.5.3_Users_Guide.pdf?#page=9 +[api-hpss]: https://hpss-collaboration.org/wp-content/uploads/2023/09/hpss_10.3_users_guide.pdf?#page=9 [api-gdrive]: https://support.google.com/a/users/answer/9282958?hl=en [api-globus]: https://docs.globus.org/cli/ @@ -330,7 +330,7 @@ Version Control | [Git][api-git]/[Subversion][api-svn]/[GitLab][api-gitlab]/[Git [api-github]: https://docs.github.com/en [api-slurm]: https://slurm.schedmd.com -[api-cobalt]: https://trac.mcs.anl.gov/projects/cobalt/wiki/CommandReference +[api-cobalt]: https://ftp.mcs.anl.gov/pub/cobalt/archive/cobalt-0.95.2-manual.pdf [api-moab]: https://iitj.ac.in/uploaded_docs/cc/HPC_training/mcmuserguide.pdf From 5e0059c6e6024ba3d3a73190f43c9b9c55344f1a Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 27 Jan 2024 20:30:01 -0800 Subject: [PATCH 30/81] fix links --- Articles/Blog/Covid19WorkstationCleanliness.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Articles/Blog/Covid19WorkstationCleanliness.md b/Articles/Blog/Covid19WorkstationCleanliness.md index 2f0498dc54..b12289c124 100644 --- a/Articles/Blog/Covid19WorkstationCleanliness.md +++ b/Articles/Blog/Covid19WorkstationCleanliness.md @@ -292,19 +292,19 @@ Pinned: no ### References -* 1[CDC guidance on CV-19 transmission](https://www.cdc.gov/coronavirus/2019-ncov/about/transmission.html) +* 1[CDC guidance on CV-19 transmission](https://www.cdc.gov/coronavirus/2019-ncov/your-health/about-covid-19.html) * 2[WHO summary remarks of COVID-19](https://www.who.int/health-topics/coronavirus) * 3[NIH CV-229E surface study](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4659470/) * 4[Aerosol and surface stability of SARS-CoV-2 compared with SARS-CoV-1](https://www.medrxiv.org/content/10.1101/2020.03.09.20033217v2.full.pdf) * 5[Surface study of various coronaviruses](https://www.journalofhospitalinfection.com/article/S0195-6701(20)30046-3/fulltext) -* 6[CDC guidance on cleaning surfaces](https://www.cdc.gov/coronavirus/2019-ncov/community/organizations/cleaning-disinfection.html#How%20to%20Clean%20and%20Disinfect) +* 6[CDC guidance on cleaning surfaces](https://www.cdc.gov/mmwr/volumes/69/wr/mm6923e2.htm) * 7[CDC description of transmission scenario](https://www.cdc.gov/coronavirus/2019-ncov/about/prevention.html?CDC_AA_refVal=https%3A%2F%2Fwww.cdc.gov%2Fcoronavirus%2F2019-ncov%2Fabout%2Fprevention-treatment.html) * 8[Wikipedia summary of coronavirus](https://en.wikipedia.org/wiki/Coronavirus) * 9[CDC explains names SARS-CoV-2 and COVID-19](https://www.who.int/emergencies/diseases/novel-coronavirus-2019/technical-guidance/naming-the-coronavirus-disease-(covid-2019)-and-the-virus-that-causes-it) -* 10[CDC describes higher risk persons](https://www.cdc.gov/coronavirus/2019-ncov/specific-groups/high-risk-complications.html) +* 10[CDC describes higher risk persons](https://www.cdc.gov/coronavirus/2019-ncov/need-extra-precautions/people-with-medical-conditions.html) * 11[Asymptomatic transmission of COVID-19](https://jamanetwork.com/journals/jama/fullarticle/2762028) * 12[Explanation of the term Fomite](https://en.wikipedia.org/wiki/Fomite) -* 13[CDC guidance on fomite transmission](https://www.cdc.gov/coronavirus/2019-ncov/prepare/cleaning-disinfection.html?CDC_AA_refVal=https%3A%2F%2Fwww.cdc.gov%2Fcoronavirus%2F2019-ncov%2Fcommunity%2Fhome%2Fcleaning-disinfection.html) +* 13[CDC guidance on fomite transmission](https://stacks.cdc.gov/view/cdc/104762) * 14[Best disinfecting wipes](https://learningregistry.org/reviews/best-disinfectant-wipes) * 15[Best practices for cleaning electronics](https://www.pcmag.com/how-to/how-to-spring-clean-your-electronics) * 16[Apple guidance on 70% alcohol](https://support.apple.com/en-us/HT204172?mod=article_inline) From bd8c3a034b626c9fb97fb3fd734233e1817d89a3 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 27 Jan 2024 20:41:31 -0800 Subject: [PATCH 31/81] fix link --- Site/BSSwFellowshipProgram/People/2020-F-Eisty.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Site/BSSwFellowshipProgram/People/2020-F-Eisty.md b/Site/BSSwFellowshipProgram/People/2020-F-Eisty.md index a415fc0991..3ecb18f6bc 100644 --- a/Site/BSSwFellowshipProgram/People/2020-F-Eisty.md +++ b/Site/BSSwFellowshipProgram/People/2020-F-Eisty.md @@ -6,7 +6,7 @@ **Image:** /images/People_2020_F_Eisty.jpg -**LinkedIn:** https://www.linkedin.com/in/nasir-uddin-eisty-492830a9/ +**LinkedIn:** https://www.google.com/search?q=%22Nasir+Eisty%22+site%3Alinkedin.com&client=safari&sca_esv=7002102899f21eb8&sxsrf=ACQVn0-pwZN-S2vGRqD7VkQ0_ZrAyKhEYw%3A1706416704187&source=hp&ei=QNq1ZfOwCdbRkPIPtbqZiAE&iflsig=ANes7DEAAAAAZbXoUAvtXPvugMZ9o5sHPUfngXXNCUH_&ved=0ahUKEwjz0PXLof-DAxXWKEQIHTVdBhEQ4dUDCA4&uact=5&oq=%22Nasir+Eisty%22+site%3Alinkedin.com&gs_lp=Egdnd3Mtd2l6Ih8iTmFzaXIgRWlzdHkiIHNpdGU6bGlua2VkaW4uY29tSNwsUABYzitwAHgAkAEAmAGGAaABmAyqAQQxOS4yuAEDyAEA-AEB-AECwgIEECMYJ8ICBRAAGIAEwgIGEAAYFhgewgIFECEYoAE&sclient=gws-wiz **Github:** https://github.com/neisty From 1a2074f2ec60fddeebe959e2d7973ce2f57fa5f3 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 28 Jan 2024 18:53:00 -0800 Subject: [PATCH 32/81] fix links --- Articles/Blog/Covid19WorkstationCleanliness.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Articles/Blog/Covid19WorkstationCleanliness.md b/Articles/Blog/Covid19WorkstationCleanliness.md index b12289c124..6086fba1d7 100644 --- a/Articles/Blog/Covid19WorkstationCleanliness.md +++ b/Articles/Blog/Covid19WorkstationCleanliness.md @@ -219,7 +219,7 @@ Pinned: no [3-sfer-ezikiw]: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4659470/ "NIH CV-229E surface study" [4-sfer-ezikiw]: https://www.medrxiv.org/content/10.1101/2020.03.09.20033217v2.full.pdf "Aerosol and surface stability of SARS-CoV-2 compared with SARS-CoV-1" [5-sfer-ezikiw]: https://www.journalofhospitalinfection.com/article/S0195-6701(20)30046-3/fulltext "Surface study of various coronaviruses" -[6-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/community/organizations/cleaning-disinfection.html#How%20to%20Clean%20and%20Disinfect "CDC guidance on cleaning surfaces" +[6-sfer-ezikiw]: https://www.cdc.gov/mmwr/volumes/69/wr/mm6923e2.htm "CDC guidance on cleaning surfaces" [7-sfer-ezikiw]: https://www.cdc.gov/coronavirus/2019-ncov/science/science-briefs/sars-cov-2-transmission.html "CDC description of transmission scenario" [8-sfer-ezikiw]: https://en.wikipedia.org/wiki/Coronavirus "Wikipedia summary of coronavirus" [9-sfer-ezikiw]: https://www.who.int/emergencies/diseases/novel-coronavirus-2019/technical-guidance/naming-the-coronavirus-disease-(covid-2019)-and-the-virus-that-causes-it "CDC explains names SARS-CoV-2 and COVID-19" From fc2cc5c1ae840983f0817a6ab27db884f7ee82ed Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 28 Jan 2024 20:10:26 -0800 Subject: [PATCH 33/81] add description --- .github/workflows/check-published-links-weekly.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index b63cad3702..1cc9003032 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -16,6 +16,7 @@ env: docs images utils + Events ignore_files: | foo @@ -90,7 +91,8 @@ jobs: # or 3) pull request # # Stores ignore cases in env. variables and then reformats those (because they have -# newlines) into a form that can be digested as inputs to other actions. +# newlines) into a comma-separated single line string that can be digested as +# inputs to other actions. # # Uses changed-files action in the case of a pull request. Otherwise, uses a find # command but builds up the find command to ignore specified directories. @@ -99,3 +101,9 @@ jobs: # list of files to examine. Eventually, the string variable holding all the file # names may get too long. # +# We include Events in dirs to ignore because of all content we host, I suspect +# Event URLs are the most likely to go stale rather quickly **and** because +# the URL validness is important only during the short window prior to the event. +# That said, we don't want to ignore Events in PRs. +# + From 7d5b88abff9be1335f5e419e6a55b7a39dc68f50 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 31 Jan 2024 16:33:44 -0800 Subject: [PATCH 34/81] final fixes --- .../check-published-links-weekly.yml | 67 +++++++++---------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-published-links-weekly.yml index 1cc9003032..5f9cec9c42 100644 --- a/.github/workflows/check-published-links-weekly.yml +++ b/.github/workflows/check-published-links-weekly.yml @@ -7,18 +7,15 @@ on: pull_request: branches: [ main ] env: - ignore_patterns: | + ignore_url_patterns: | http://localhost:4000 https://preview.bssw.io https://github.com/ - ignore_dirs: | - .github - docs - images - utils - Events - ignore_files: | - foo + ignore_file_patterns: | + docs/ + images/ + utils/ + Events/ jobs: check-urls: @@ -28,10 +25,10 @@ jobs: - name: Reformat environment variables id: setup_vars run: | - tmp=$(echo "${{ env.ignore_patterns }}" | tr '\n' ',' | sed -e 's/,,$//') - echo "ignore_patterns=$tmp" >> $GITHUB_OUTPUT - tmp=$(echo "${{ env.ignore_dirs }}" | sed -e 's@\(.*\)@"**/\1/**"@' | tr '\n' ',' | sed -e 's@,"\*\*//\*\*",$@@') - echo "ignore_dirs=$tmp" >> $GITHUB_OUTPUT + tmp=$(echo "${{ env.ignore_url_patterns }}" | tr '\n' ',' | sed -e 's/,,$//') + echo "ignore_url_patterns=$tmp" >> $GITHUB_OUTPUT + tmp=$(echo "${{ env.ignore_file_patterns }}" | tr '\n' ',' | sed -e 's/,,$//') + echo "ignore_file_patterns=$tmp" >> $GITHUB_OUTPUT - name: Checkout Repository uses: actions/checkout@v4 @@ -42,21 +39,16 @@ jobs: uses: tj-actions/changed-files@v42 with: separator: ',' - files: '**.md' - files_ignore: ${{ steps.setup_vars.outputs.ignore_dirs }} - - name: Generate list of selected files to URL check + - name: Generate lists of files to check and ignore id: file_list run: | if [ "${{ github.event_name }}" = "pull_request" ]; then echo "files=${{ steps.changed-files.outputs.all_changed_files }}" >> $GITHUB_OUTPUT + echo "ignore_file_patterns=" >> $GITHUB_OUTPUT else - fcmd="" - tmp=$(echo "${{ env.ignore_dirs }}" | tr '\n' ' ' | sed -e 's/ $//') - for d in $ignore_dirs; do - fcmd="$fcmd -name $d -prune -o " - done - echo "files=$(find . $fcmd -name '*.md' -print | tr '\n' ',')" >> $GITHUB_OUTPUT + echo "files=" >> $GITHUB_OUTPUT + echo "ignore_file_patterns= ${{ steps.setup_vars.outputs.ignore_file_patterns }}" >> $GITHUB_OUTPUT fi - name: Check URLs in selected files @@ -79,7 +71,10 @@ jobs: timeout: 10 # Exclude these patterns from the checker - exclude_patterns: ${{ steps.setup_vars.outputs.ignore_patterns }} + exclude_patterns: ${{ steps.setup_vars.outputs.ignore_url_patterns }} + + # Exclude these dirs and files + exclude_files: "${{ steps.file_list.outputs.ignore_file_patterns }}" # Operate only on the specific list of files selected above include_files: "${{ steps.file_list.outputs.files }}" @@ -90,20 +85,22 @@ jobs: # Triggers in one of three ways; 1) manually, 2) scheduled weekly Sunday's 5:17 AM # or 3) pull request # -# Stores ignore cases in env. variables and then reformats those (because they have -# newlines) into a comma-separated single line string that can be digested as -# inputs to other actions. +# Stores ignore pattern cases in env. variables and then reformats those (because +# they have newlines) into a comma-separated single line string that can be digested +# as inputs to other actions. # -# Uses changed-files action in the case of a pull request. Otherwise, uses a find -# command but builds up the find command to ignore specified directories. +# For PRs, uses changed-files action to get list of changed files and passes this +# to urlchecker via `include_files param. Also, ignore file patterns is set to +# empty string for PRs because we think URLs anywhere in PRs should be checked. # -# From either the changed files of a pull request or a find command, generates a -# list of files to examine. Eventually, the string variable holding all the file -# names may get too long. +# For scheduled or manual triggers, uses fact that empty `include_files` param +# causes urlchecker to process *all* files that match in `file_type` param but do +# not match any `exclude_files` patterns. These file patterns for exclude work +# more or less like file globs. So, specifying the initial part of the string +# for a file (path) name is sufficient to ignore the file. # -# We include Events in dirs to ignore because of all content we host, I suspect -# Event URLs are the most likely to go stale rather quickly **and** because +# We include Events in file patterns to ignore because of all content we host, +# we suspect Event URLs are the most likely to go stale rather quickly **and** because # the URL validness is important only during the short window prior to the event. -# That said, we don't want to ignore Events in PRs. +# That said, we don't want to ignore Events in PRs and we do not as per above. # - From 77c78dbde723741cb94d0ea6664811542c56336d Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 31 Jan 2024 16:36:11 -0800 Subject: [PATCH 35/81] add check-urls --- .github/workflows/README.md | 4 ++++ .../{check-published-links-weekly.yml => check-urls.yml} | 0 2 files changed, 4 insertions(+) rename .github/workflows/{check-published-links-weekly.yml => check-urls.yml} (100%) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 7bc2b1e989..bd3e9632e1 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -30,6 +30,10 @@ Format: - https://github.com/betterscientificsoftware/bssw.io/blob/0e1ba1664239ab6097e903f8bb94fef08576f103/.github/workflows/notify-external-contrib.yml#L6-L12 - job: notify-external-contributions - Label and send email to bssw-editorial-list for externally opened issues, prs and discussions +* check-urls.yml (Check URLs) + - trigger: manual, scheduled, pull-request + - checks URLs in a PR or in whole repo. + # Gaps * PR is closed without merge. We should back out the whole PR from preview? Or kill and recreate preview? diff --git a/.github/workflows/check-published-links-weekly.yml b/.github/workflows/check-urls.yml similarity index 100% rename from .github/workflows/check-published-links-weekly.yml rename to .github/workflows/check-urls.yml From e419e4bd05b67593982b7e3278575184114f543e Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 4 Feb 2024 10:47:02 -0800 Subject: [PATCH 36/81] test new checkere --- .github/workflows/check-urls.yml | 2 +- utils/wikize_refs.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 5f9cec9c42..eafcb7b335 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -52,7 +52,7 @@ jobs: fi - name: Check URLs in selected files - uses: urlstechie/urlchecker-action@0.0.34 + uses: urlstechie/urlchecker-action@update/0.0.35 with: # Work only on markdown files diff --git a/utils/wikize_refs.py b/utils/wikize_refs.py index 731480f217..0243134fed 100755 --- a/utils/wikize_refs.py +++ b/utils/wikize_refs.py @@ -213,7 +213,7 @@ def broken_link(x, timeout=20): x.startswith('#'): return False - req = Request(x, None, {'User-Agent': broken_link.agent}) + req = Request(x, None, {'User-Agent': broken_link.agent}, 'HEAD') try: resp = urlopen(req, None, timeout) @@ -221,13 +221,25 @@ def broken_link(x, timeout=20): status = resp.getcode() except: status = resp.status + print(x, status) if status in [404,408,409,501,502,503]: return True else: return False except: + print(x, "Excepted") return True + + + + + + + + + + def diff_and_keep_sorted(l1, l2): """Difference two lists. If result is all ints, sort numerically. Otherwise sort lexicographically.""" @@ -512,9 +524,9 @@ def error_checks(file_lines, fn_handles, ref_map, check_links, has_lddbs): if check_links: for k in ref_map: url = ref_map[k][0] - if not valid_url(url): - message("Invalid URL: \"%s\""%url) - elif broken_link(url, check_links): + #if not valid_url(url): + # message("Invalid URL: \"%s\""%url) + if broken_link(url, check_links): message("Broken URL: \"%s\""%url) return missing_refs From 5ad20571857382287758411d626ae4d462fd8cf2 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 4 Feb 2024 10:53:40 -0800 Subject: [PATCH 37/81] add no-check-certs --- .github/workflows/check-urls.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index eafcb7b335..1a2b2bbc8e 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -70,6 +70,9 @@ jobs: # Google Forms is having enormous timeouts timeout: 10 + # Ignore certificate issues + no_check_certs: True + # Exclude these patterns from the checker exclude_patterns: ${{ steps.setup_vars.outputs.ignore_url_patterns }} From cfa5b8d202b10638742403359cf8b29cda7bd30e Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 4 Feb 2024 16:05:05 -0800 Subject: [PATCH 38/81] fix links to pdfs in swe-cse-bof --- Articles/Blog/2021-12-sc21-swe-cse-bof.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Articles/Blog/2021-12-sc21-swe-cse-bof.md b/Articles/Blog/2021-12-sc21-swe-cse-bof.md index bda4aaf2f3..3dfaab2be4 100644 --- a/Articles/Blog/2021-12-sc21-swe-cse-bof.md +++ b/Articles/Blog/2021-12-sc21-swe-cse-bof.md @@ -16,12 +16,12 @@ We’ve settled on a format for the BoF that includes 3-minute lightning talks, We had six lightning talks this year, covering a wide range of topics with speakers from the US, UK, Germany, and Australia. -* [Ecosystems are the Future!](https://betterscientificsoftware.github.io/swe-cse-bof/2021-11-sc21-bof/01-brown-ecosystems.pdf) by Benjamin Brown (Office of Advanced Scientific Computing Research (ASCR), Office of Science, U.S. Dept. of Energy). Ben discussed the importance of ecosystems in his vision for the future of the ASCR high performance computing and networking user facilities ([ALCF](https://www.alcf.anl.gov/), [NERSC](https://www.nersc.gov/), [OLCF](https://www.olcf.ornl.gov/), and[ ESnet](https://www.es.net/)) and highlighted scientific software as one of the key ecosystems. He closed with the message “Software ecosystems are research infrastructure!” +* [Ecosystems are the Future!](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/01-brown-ecosystems.pdf) by Benjamin Brown (Office of Advanced Scientific Computing Research (ASCR), Office of Science, U.S. Dept. of Energy). Ben discussed the importance of ecosystems in his vision for the future of the ASCR high performance computing and networking user facilities ([ALCF](https://www.alcf.anl.gov/), [NERSC](https://www.nersc.gov/), [OLCF](https://www.olcf.ornl.gov/), and[ ESnet](https://www.es.net/)) and highlighted scientific software as one of the key ecosystems. He closed with the message “Software ecosystems are research infrastructure!” * [Open Source for Researchers](https://doi.org/10.5281/zenodo.5655022) by Yo Yehudi (Wellcome Trust). Yo encouraged us to up our game for our open source software projects, making them more open and more accessible to others. She touched on readme files, roadmaps, contributor guides, codes of conduct, a requested citation, contact information, and using an issue tracker and also suggested some resources to help folks get started contributing to open source software. -* [The Internat. CSE Master Program at TUM](https://betterscientificsoftware.github.io/swe-cse-bof/2021-11-sc21-bof/03-bader-masterprogram.pdf) by Michael Bader (Technical University of Munich, TUM). Michael described the International Master’s program in Computational Science and Engineering, which has been offered at TUM since 2001, currently serving approximately 50 students per year. The 4-semester program combines classes in computer science, numerical analysis, and scientific computing - with one of the key challenges being how to guide students from various backgrounds towards becoming experts in software development for supercomputing applications. -* [Senior Level RSE career paths (with an s)](https://betterscientificsoftware.github.io/swe-cse-bof/2021-11-sc21-bof/04-katz-seniorrse.pdf) by Daniel S. Katz (University of Illinois at Urbana-Champaign). Dan presented ideas to help define paths for career progression for research software engineers (RSEs), looking particularly at offering a richer set of opportunities at senior levels to allow RSEs to explore different roles, emphasizing different skills. -* [FAIR 4 Research Software (FAIR4RS)](https://betterscientificsoftware.github.io/swe-cse-bof/2021-11-sc21-bof/05-barker-fair4rs.pdf) by Michelle Barker (Research Software Alliance). Michelle presented the emerging idea of applying the FAIR principles (findability, accessibility, interoperability, reusability) to research software, noting “Software is not just another type of data.” The Research Data Alliance, FORCE11, and the Research Software Alliance are working together to develop the FAIR4RS principles and guidelines for implementing them. -* [Highlights from the IEEE CS Ad Hoc Committee on Open Science & Reproducibility](https://betterscientificsoftware.github.io/swe-cse-bof/2021-11-sc21-bof/06-parashar-openscience.pdf) by Manish Parashar (University of Utah). In 2019, the National Academies of Science, Engineering, and Medicine (NASEM) published a report on Reproducibility and Replicability in Science. Manish described work by the IEEE Computer Society, building upon the NASEM report, to develop an action plan to improve and recognize reproducibility in the society’s publications, conferences, and through its technical committees. +* [The Internat. CSE Master Program at TUM](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/03-bader-masterprogram.pdf) by Michael Bader (Technical University of Munich, TUM). Michael described the International Master’s program in Computational Science and Engineering, which has been offered at TUM since 2001, currently serving approximately 50 students per year. The 4-semester program combines classes in computer science, numerical analysis, and scientific computing - with one of the key challenges being how to guide students from various backgrounds towards becoming experts in software development for supercomputing applications. +* [Senior Level RSE career paths (with an s)](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/04-katz-seniorrse.pdf) by Daniel S. Katz (University of Illinois at Urbana-Champaign). Dan presented ideas to help define paths for career progression for research software engineers (RSEs), looking particularly at offering a richer set of opportunities at senior levels to allow RSEs to explore different roles, emphasizing different skills. +* [FAIR 4 Research Software (FAIR4RS)](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/05-barker-fair4rs.pdf) by Michelle Barker (Research Software Alliance). Michelle presented the emerging idea of applying the FAIR principles (findability, accessibility, interoperability, reusability) to research software, noting “Software is not just another type of data.” The Research Data Alliance, FORCE11, and the Research Software Alliance are working together to develop the FAIR4RS principles and guidelines for implementing them. +* [Highlights from the IEEE CS Ad Hoc Committee on Open Science & Reproducibility](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/06-parashar-openscience.pdf) by Manish Parashar (University of Utah). In 2019, the National Academies of Science, Engineering, and Medicine (NASEM) published a report on Reproducibility and Replicability in Science. Manish described work by the IEEE Computer Society, building upon the NASEM report, to develop an action plan to improve and recognize reproducibility in the society’s publications, conferences, and through its technical committees. ### Breakout discussions From 3373b55cea3b92ea271fc07b17d96931495e4f86 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 25 Mar 2024 09:58:01 -0700 Subject: [PATCH 39/81] ws change to trigger a commit --- .github/workflows/check-urls.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 1a2b2bbc8e..e80a3f33f6 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -107,3 +107,4 @@ jobs: # the URL validness is important only during the short window prior to the event. # That said, we don't want to ignore Events in PRs and we do not as per above. # + From 42e39a9f0456e18e6c204b455b7b4b5fa1b68543 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Tue, 14 May 2024 14:31:26 -0700 Subject: [PATCH 40/81] try new link check util --- .github/workflows/check-urls.yml | 53 ++++++++++++++------------------ 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index e80a3f33f6..bc6d030314 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -22,6 +22,17 @@ jobs: runs-on: ubuntu-latest steps: + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + python -m pip --no-cache-dir --disable-pip-version-check install --upgrade pip + python -m pip --no-cache-dir --disable-pip-version-check install linkchecker + - name: Reformat environment variables id: setup_vars run: | @@ -48,39 +59,21 @@ jobs: echo "ignore_file_patterns=" >> $GITHUB_OUTPUT else echo "files=" >> $GITHUB_OUTPUT - echo "ignore_file_patterns= ${{ steps.setup_vars.outputs.ignore_file_patterns }}" >> $GITHUB_OUTPUT + echo "ignore_file_patterns=${{ steps.setup_vars.outputs.ignore_file_patterns }}" >> $GITHUB_OUTPUT fi + #exclude_patterns: ${{ steps.setup_vars.outputs.ignore_url_patterns }} - name: Check URLs in selected files - uses: urlstechie/urlchecker-action@update/0.0.35 - with: - - # Work only on markdown files - file_types: .md - - # Choose whether to include file with no URLs in the prints. - print_all: false - - # More verbose summary at the end of a run - verbose: true - - # How many times to retry a failed request (defaults to 1) - retry_count: 3 - - # Google Forms is having enormous timeouts - timeout: 10 - - # Ignore certificate issues - no_check_certs: True - - # Exclude these patterns from the checker - exclude_patterns: ${{ steps.setup_vars.outputs.ignore_url_patterns }} - - # Exclude these dirs and files - exclude_files: "${{ steps.file_list.outputs.ignore_file_patterns }}" - - # Operate only on the specific list of files selected above - include_files: "${{ steps.file_list.outputs.files }}" + run: | + which linkchecker + for f in ${{ steps.file_list.outputs.files }}; do + for ef in ${{ steps.file_list.outputs.ignore_file_patterns }}; do + if [ "$ef" == "$f" ]; then + continue 2 # ignore this file + fi + done + echo $f + done # # Description: From 122ba08b431b9e7dcd48fbe5f43a58c9399471a0 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Tue, 14 May 2024 19:32:53 -0700 Subject: [PATCH 41/81] apply linkchecker per md file --- .github/workflows/check-urls.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index bc6d030314..c593ea28d0 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -65,14 +65,23 @@ jobs: #exclude_patterns: ${{ steps.setup_vars.outputs.ignore_url_patterns }} - name: Check URLs in selected files run: | - which linkchecker + # Create an rc file controlling behavior of linkchecker + echo " + [csv] + separator=, + parts=urlname,parentname,result + [MarkdownCheck] + filename_re=.*\.md$ + " > .linkcheckerrc + rm linkchecker-out-all.csv for f in ${{ steps.file_list.outputs.files }}; do for ef in ${{ steps.file_list.outputs.ignore_file_patterns }}; do if [ "$ef" == "$f" ]; then continue 2 # ignore this file fi done - echo $f + linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f + cat linkchecker-out.csv >> linkchecker-out-all.csv done # From c9f4aa345341a913a9b8776adad1dc7184542a95 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Tue, 14 May 2024 19:33:59 -0700 Subject: [PATCH 42/81] fix logic error --- .github/workflows/check-urls.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index c593ea28d0..51207a16b6 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -73,7 +73,6 @@ jobs: [MarkdownCheck] filename_re=.*\.md$ " > .linkcheckerrc - rm linkchecker-out-all.csv for f in ${{ steps.file_list.outputs.files }}; do for ef in ${{ steps.file_list.outputs.ignore_file_patterns }}; do if [ "$ef" == "$f" ]; then From f34f557e9b2bed5147e82716ee9b46a5c4ab5744 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 15 May 2024 13:24:55 -0700 Subject: [PATCH 43/81] adjust params in checker --- .github/workflows/check-urls.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 51207a16b6..37d2296aac 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -67,6 +67,10 @@ jobs: run: | # Create an rc file controlling behavior of linkchecker echo " + [checking] + sslverify=0 + maxfilesizedownload=100000 + maxfilesizeparse=100000 [csv] separator=, parts=urlname,parentname,result @@ -74,8 +78,11 @@ jobs: filename_re=.*\.md$ " > .linkcheckerrc for f in ${{ steps.file_list.outputs.files }}; do + if [ "${f##.*}" != "md" ]; then + continue + fi for ef in ${{ steps.file_list.outputs.ignore_file_patterns }}; do - if [ "$ef" == "$f" ]; then + if [ "$ef" = "$f" ]; then continue 2 # ignore this file fi done From 4d838780dd843ad61304e8582c7905d9a3b8391b Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 15 May 2024 13:27:45 -0700 Subject: [PATCH 44/81] adjust comma seps --- .github/workflows/check-urls.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 37d2296aac..a81a6ecec6 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -36,9 +36,9 @@ jobs: - name: Reformat environment variables id: setup_vars run: | - tmp=$(echo "${{ env.ignore_url_patterns }}" | tr '\n' ',' | sed -e 's/,,$//') + tmp=$(echo "${{ env.ignore_url_patterns }}" | tr '\n' ' ' | sed -e 's/,,$//') echo "ignore_url_patterns=$tmp" >> $GITHUB_OUTPUT - tmp=$(echo "${{ env.ignore_file_patterns }}" | tr '\n' ',' | sed -e 's/,,$//') + tmp=$(echo "${{ env.ignore_file_patterns }}" | tr '\n' ' ' | sed -e 's/,,$//') echo "ignore_file_patterns=$tmp" >> $GITHUB_OUTPUT - name: Checkout Repository From 05e85ddc356fc53ec614a3f7fbba98b02a90e50b Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 15 May 2024 13:30:16 -0700 Subject: [PATCH 45/81] adjust comma seps --- .github/workflows/check-urls.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index a81a6ecec6..e74237f108 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -36,9 +36,9 @@ jobs: - name: Reformat environment variables id: setup_vars run: | - tmp=$(echo "${{ env.ignore_url_patterns }}" | tr '\n' ' ' | sed -e 's/,,$//') + tmp=$(echo "${{ env.ignore_url_patterns }}" | tr '\n' ' ') echo "ignore_url_patterns=$tmp" >> $GITHUB_OUTPUT - tmp=$(echo "${{ env.ignore_file_patterns }}" | tr '\n' ' ' | sed -e 's/,,$//') + tmp=$(echo "${{ env.ignore_file_patterns }}" | tr '\n' ' ') echo "ignore_file_patterns=$tmp" >> $GITHUB_OUTPUT - name: Checkout Repository @@ -49,7 +49,7 @@ jobs: id: changed-files uses: tj-actions/changed-files@v42 with: - separator: ',' + separator: ' ' - name: Generate lists of files to check and ignore id: file_list From a0f91c26e411d17498cbbbc20a47b34eafc48226 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 15 May 2024 13:35:04 -0700 Subject: [PATCH 46/81] fix syntax for extension --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index e74237f108..470fa5a998 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -78,7 +78,7 @@ jobs: filename_re=.*\.md$ " > .linkcheckerrc for f in ${{ steps.file_list.outputs.files }}; do - if [ "${f##.*}" != "md" ]; then + if [ "${f##*.}" != "md" ]; then continue fi for ef in ${{ steps.file_list.outputs.ignore_file_patterns }}; do From 9bf7b9af57684e46d46ca80848bd9675c2dd7988 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 15 May 2024 14:03:41 -0700 Subject: [PATCH 47/81] continue loop after failures --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 470fa5a998..5912114475 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -86,7 +86,7 @@ jobs: continue 2 # ignore this file fi done - linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f + linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f || true cat linkchecker-out.csv >> linkchecker-out-all.csv done From 137f3d084c87115d34c332ae234a1f8eb8e27cdc Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 10:42:47 -0700 Subject: [PATCH 48/81] WS change to trigger action --- .github/workflows/check-urls.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 5912114475..13332b320c 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -115,4 +115,3 @@ jobs: # the URL validness is important only during the short window prior to the event. # That said, we don't want to ignore Events in PRs and we do not as per above. # - From d42614d3c9358be2d155522665ef371a5a721b6d Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 11:23:18 -0700 Subject: [PATCH 49/81] add artifact upload --- .github/workflows/check-urls.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 13332b320c..f15203c28b 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -90,6 +90,12 @@ jobs: cat linkchecker-out.csv >> linkchecker-out-all.csv done + - name: Upload artifact + uses: actions/upload-artifact@v3 + with: + name: my-artifact + path: linkchecker-out-all.csv + # # Description: # From ab50688d9325c9789e65a6e80ff9f297c9b9ead6 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 19:36:46 -0700 Subject: [PATCH 50/81] get all logger parts --- .github/workflows/check-urls.yml | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index f15203c28b..1b5f0e7429 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -73,10 +73,11 @@ jobs: maxfilesizeparse=100000 [csv] separator=, - parts=urlname,parentname,result + parts=all [MarkdownCheck] filename_re=.*\.md$ " > .linkcheckerrc + echo "urlname,parentname,result > linkchecker-out-all.csv for f in ${{ steps.file_list.outputs.files }}; do if [ "${f##*.}" != "md" ]; then continue @@ -87,15 +88,35 @@ jobs: fi done linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f || true - cat linkchecker-out.csv >> linkchecker-out-all.csv + # cat linkchecker-out.csv | sed '1n; /urlname,parentname,result/d' | grep -v ',200$ >> linkchecker-out-all.csv done - name: Upload artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: my-artifact path: linkchecker-out-all.csv + +# +# Keep the recurring failures and definitely bad lists in repo on +# branch manage-broken-links +# +# Download those files before startin +# +# If a link "works" (200) remove it from "recurring failures" list +# If a link "does not work" (!= 200) +# - if it is already on recurring failures list +# - if it is too old, flag it as "definitely bad", else nothing +# +# - if it is not already on persistent failures list, add it to "new" and "persistent failures" list and date it + +# +# Upload the recurring failures and definitely bad lists to somehwere +# Report success if definitely bad list is empty, otherwise failure +# generate email with links or actual data +# + # # Description: # From ad297bdfc3687a0f183c0310c85607f03e24b722 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 19:38:30 -0700 Subject: [PATCH 51/81] reset logger parts --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 1b5f0e7429..878674bdfa 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -73,7 +73,7 @@ jobs: maxfilesizeparse=100000 [csv] separator=, - parts=all + parts=name,parentname,result [MarkdownCheck] filename_re=.*\.md$ " > .linkcheckerrc From 2e673e3bac472a0169e20c340eaf89f783821552 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 19:40:11 -0700 Subject: [PATCH 52/81] fix missing closing " --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 878674bdfa..4d98e33ebe 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -77,7 +77,7 @@ jobs: [MarkdownCheck] filename_re=.*\.md$ " > .linkcheckerrc - echo "urlname,parentname,result > linkchecker-out-all.csv + #echo "urlname,parentname,result" > linkchecker-out-all.csv for f in ${{ steps.file_list.outputs.files }}; do if [ "${f##*.}" != "md" ]; then continue From a7c8f34d9cc982fa5ade7f5ad19554837c1f80a8 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 19:46:37 -0700 Subject: [PATCH 53/81] fix cat operation --- .github/workflows/check-urls.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 4d98e33ebe..4147dc28a9 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -77,7 +77,7 @@ jobs: [MarkdownCheck] filename_re=.*\.md$ " > .linkcheckerrc - #echo "urlname,parentname,result" > linkchecker-out-all.csv + echo "urlname,parentname,result" > linkchecker-out-all.csv for f in ${{ steps.file_list.outputs.files }}; do if [ "${f##*.}" != "md" ]; then continue @@ -88,7 +88,7 @@ jobs: fi done linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f || true - # cat linkchecker-out.csv | sed '1n; /urlname,parentname,result/d' | grep -v ',200$ >> linkchecker-out-all.csv + cat linkchecker-out.csv | sed '1n; /urlname,parentname,result/d' | grep -v ',200$ >> linkchecker-out-all.csv done - name: Upload artifact From 5f8d4b733117bc4c68be192126b19738c61d72e4 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 19:48:46 -0700 Subject: [PATCH 54/81] fix sed filter --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 4147dc28a9..91cae6e0f4 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -88,7 +88,7 @@ jobs: fi done linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f || true - cat linkchecker-out.csv | sed '1n; /urlname,parentname,result/d' | grep -v ',200$ >> linkchecker-out-all.csv + cat linkchecker-out.csv | sed '1n; /urlname,parentname,result/d' | grep -v ',200.*$' >> linkchecker-out-all.csv done - name: Upload artifact From f09fba10b890aac39549d6fdd64776cba01645b4 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 19:54:35 -0700 Subject: [PATCH 55/81] reset logger parts --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 91cae6e0f4..f7115a6eeb 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -73,7 +73,7 @@ jobs: maxfilesizeparse=100000 [csv] separator=, - parts=name,parentname,result + parts=urlname,parentname,result [MarkdownCheck] filename_re=.*\.md$ " > .linkcheckerrc From 2fe80dbb5e3a5c284a7ca6c9ff8aa1ea5e187f21 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 20:07:05 -0700 Subject: [PATCH 56/81] fix logger parts --- .github/workflows/check-urls.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index f7115a6eeb..9da12e5ca7 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -73,11 +73,10 @@ jobs: maxfilesizeparse=100000 [csv] separator=, - parts=urlname,parentname,result + parts=urlname,parentname,warning,result [MarkdownCheck] filename_re=.*\.md$ " > .linkcheckerrc - echo "urlname,parentname,result" > linkchecker-out-all.csv for f in ${{ steps.file_list.outputs.files }}; do if [ "${f##*.}" != "md" ]; then continue @@ -88,7 +87,7 @@ jobs: fi done linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f || true - cat linkchecker-out.csv | sed '1n; /urlname,parentname,result/d' | grep -v ',200.*$' >> linkchecker-out-all.csv + tail -n +2 linkchecker-out.csv | grep -v ',200.*$' >> linkchecker-out-all.csv done - name: Upload artifact From 2e7cab20d43b04837a67a5e12edbec0f03196242 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 20:21:24 -0700 Subject: [PATCH 57/81] logger parts all --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 9da12e5ca7..1622453015 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -73,7 +73,7 @@ jobs: maxfilesizeparse=100000 [csv] separator=, - parts=urlname,parentname,warning,result + parts=all [MarkdownCheck] filename_re=.*\.md$ " > .linkcheckerrc From 975021add736e2d40667d1abc81ce6e22d76307d Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 23:30:25 -0700 Subject: [PATCH 58/81] use diff sep char --- .github/workflows/check-urls.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 1622453015..31406bf96f 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -72,7 +72,7 @@ jobs: maxfilesizedownload=100000 maxfilesizeparse=100000 [csv] - separator=, + separator=; parts=all [MarkdownCheck] filename_re=.*\.md$ @@ -87,7 +87,8 @@ jobs: fi done linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f || true - tail -n +2 linkchecker-out.csv | grep -v ',200.*$' >> linkchecker-out-all.csv + #tail -n +2 linkchecker-out.csv | grep -v ',200.*$' >> linkchecker-out-all.csv + tail -n +4 linkchecker-out.csv >> linkchecker-out-all.csv done - name: Upload artifact From 8aa0130d44b0e404ffcae42ba5a8717f73de2e9d Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Wed, 26 Jun 2024 23:51:18 -0700 Subject: [PATCH 59/81] test parse fix --- .github/workflows/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index bd3e9632e1..6f1d101247 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -73,6 +73,6 @@ Format: - Doesn't seem to work for PRs from forks * Mark C. Miller has introduced some useful MD-related processing into some of his other projects - There was some stuff document in the About file in earlier versions of the EB-docs. Seems to have been removed now. - - provides an example of what the logs look like when a spelling check fails - - invokes a linter, a spell checker, and a lnk checker + - https://travis-ci.com/github/visit-dav/visit-website/builds/181169664 provides an example of what the logs look like when a spelling check fails + - https://github.com/visit-dav/visit-website/blob/gh-pages/.travis.yml invokes a linter, a spell checker, and a lnk checker * Long ago, Will Mclendon of SNL wrote a script to validate an article's metadata. It would probably be useful to dust that off and update it. From 05e20375d85a68ec918888025b179690015d3817 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Thu, 27 Jun 2024 00:15:29 -0700 Subject: [PATCH 60/81] test parse fix --- .github/workflows/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 6f1d101247..ec0535698c 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -73,6 +73,5 @@ Format: - Doesn't seem to work for PRs from forks * Mark C. Miller has introduced some useful MD-related processing into some of his other projects - There was some stuff document in the About file in earlier versions of the EB-docs. Seems to have been removed now. - - https://travis-ci.com/github/visit-dav/visit-website/builds/181169664 provides an example of what the logs look like when a spelling check fails - https://github.com/visit-dav/visit-website/blob/gh-pages/.travis.yml invokes a linter, a spell checker, and a lnk checker * Long ago, Will Mclendon of SNL wrote a script to validate an article's metadata. It would probably be useful to dust that off and update it. From 235745dbbcd2f7c7d102b5aceaad6c957573a949 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Thu, 27 Jun 2024 00:19:24 -0700 Subject: [PATCH 61/81] test parse fix --- .github/workflows/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index ec0535698c..8f5ac74cf3 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -27,7 +27,6 @@ Format: - job: assign_to_board - Assigns issues and PRs to either Content Development board or BSSw Internal board based on labels ("content: *" or "scope: site-internal") * notify-external-contrib.yml (Notify external contributions) - - https://github.com/betterscientificsoftware/bssw.io/blob/0e1ba1664239ab6097e903f8bb94fef08576f103/.github/workflows/notify-external-contrib.yml#L6-L12 - job: notify-external-contributions - Label and send email to bssw-editorial-list for externally opened issues, prs and discussions * check-urls.yml (Check URLs) @@ -73,5 +72,4 @@ Format: - Doesn't seem to work for PRs from forks * Mark C. Miller has introduced some useful MD-related processing into some of his other projects - There was some stuff document in the About file in earlier versions of the EB-docs. Seems to have been removed now. - - https://github.com/visit-dav/visit-website/blob/gh-pages/.travis.yml invokes a linter, a spell checker, and a lnk checker * Long ago, Will Mclendon of SNL wrote a script to validate an article's metadata. It would probably be useful to dust that off and update it. From 02b5458ddc3dd73835cfa50e155462c679dd1079 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Thu, 27 Jun 2024 23:52:34 -0700 Subject: [PATCH 62/81] bracket links with <> --- .github/workflows/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 8f5ac74cf3..f8328cff74 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -27,12 +27,9 @@ Format: - job: assign_to_board - Assigns issues and PRs to either Content Development board or BSSw Internal board based on labels ("content: *" or "scope: site-internal") * notify-external-contrib.yml (Notify external contributions) + - - job: notify-external-contributions - Label and send email to bssw-editorial-list for externally opened issues, prs and discussions -* check-urls.yml (Check URLs) - - trigger: manual, scheduled, pull-request - - checks URLs in a PR or in whole repo. - # Gaps * PR is closed without merge. We should back out the whole PR from preview? Or kill and recreate preview? @@ -72,4 +69,6 @@ Format: - Doesn't seem to work for PRs from forks * Mark C. Miller has introduced some useful MD-related processing into some of his other projects - There was some stuff document in the About file in earlier versions of the EB-docs. Seems to have been removed now. + - provides an example of what the logs look like when a spelling check fails + - invokes a linter, a spell checker, and a lnk checker * Long ago, Will Mclendon of SNL wrote a script to validate an article's metadata. It would probably be useful to dust that off and update it. From 2310eff7c908caa23f1b6eab8167b3669c354c73 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Thu, 27 Jun 2024 23:52:53 -0700 Subject: [PATCH 63/81] use rc file --- .github/workflows/check-urls.yml | 26 +-- utils/LinkChecker/.linkcheckerrc | 298 +++++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+), 17 deletions(-) create mode 100644 utils/LinkChecker/.linkcheckerrc diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 31406bf96f..c5f7d73140 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -65,18 +65,6 @@ jobs: #exclude_patterns: ${{ steps.setup_vars.outputs.ignore_url_patterns }} - name: Check URLs in selected files run: | - # Create an rc file controlling behavior of linkchecker - echo " - [checking] - sslverify=0 - maxfilesizedownload=100000 - maxfilesizeparse=100000 - [csv] - separator=; - parts=all - [MarkdownCheck] - filename_re=.*\.md$ - " > .linkcheckerrc for f in ${{ steps.file_list.outputs.files }}; do if [ "${f##*.}" != "md" ]; then continue @@ -86,17 +74,15 @@ jobs: continue 2 # ignore this file fi done - linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f || true - #tail -n +2 linkchecker-out.csv | grep -v ',200.*$' >> linkchecker-out-all.csv - tail -n +4 linkchecker-out.csv >> linkchecker-out-all.csv + linkchecker -f utils/LinkChecker/.linkcheckerrc file://$(pwd)/$f >> linkchecker.out || true + cat linkchecker.out >> linkchecker-all.out done - name: Upload artifact uses: actions/upload-artifact@v4 with: name: my-artifact - path: linkchecker-out-all.csv - + path: linkchecker-all.out # # Keep the recurring failures and definitely bad lists in repo on @@ -117,6 +103,12 @@ jobs: # generate email with links or actual data # +# bare URLs in markdown are not actually links and will not be checked. Many +# markdown renderers and browsers will recognize these and handle them as links +# but that is by convention only. There is no markdown standard for how bare +# URLs in markdown are handled. The only standard is to enclose them in `<` and +# `>` chars. + # # Description: # diff --git a/utils/LinkChecker/.linkcheckerrc b/utils/LinkChecker/.linkcheckerrc new file mode 100644 index 0000000000..60b7f6f798 --- /dev/null +++ b/utils/LinkChecker/.linkcheckerrc @@ -0,0 +1,298 @@ +# Sample configuration file; see the linkcheckerrc(5) man page or +# execute linkchecker -h for help on these options. +# Commandline options override these settings. + +##################### output configuration ########################## +[output] +# enable debug messages; see 'linkchecker -h' for valid debug names, example: +#debug=all +# print status output +status=0 +# change the logging type +#log=text +# turn on/off --verbose +verbose=1 +# turn on/off --warnings +#warnings=1 +# turn on/off --quiet +#quiet=0 +# additional file output, example: +#fileoutput = text, html, gml, sql +# errors to ignore (URL regular expression, message regular expression) +#ignoreerrors= +# ignore all errors for broken.example.com: +# ^https?://broken.example.com/ +# ignore SSL errors for dev.example.com: +# ^https://dev.example.com/ ^SSLError + + +##################### logger configuration ########################## +# logger output part names: +# all For all parts +# realurl The full url link +# result Valid or invalid, with messages +# extern 1 or 0, only in some logger types reported +# base +# name name and name +# parenturl The referrer URL if there is any +# info Some additional info, e.g. FTP welcome messages +# warning Warnings +# dltime Download time +# checktime Check time +# url The original url name, can be relative +# intro The blurb at the beginning, "starting at ..." +# outro The blurb at the end, "found x errors ..." +# stats Statistics including URL lengths and contents. + +# each Logger can have separate configuration parameters + +# standard text logger +[text] +#filename=linkchecker-out.txt +#parts=all +#wraplength=65 +# colors for the various parts, syntax is or ; +# type can be bold, light, blink, invert +# color can be default, black, red, green, yellow, blue, purple, cyan, white, +# Black, Red, Green, Yellow, Blue, Purple, Cyan, White +#colorparent=default +#colorurl=default +#colorname=default +#colorreal=cyan +#colorbase=purple +#colorvalid=bold;green +#colorinvalid=bold;red +#colorinfo=default +#colorwarning=bold;yellow +#colordltime=default +#colorreset=default + +# GML logger +[gml] +#filename=linkchecker-out.gml +#parts=all +# valid encodings are listed in http://docs.python.org/library/codecs.html#standard-encodings +# example: +#encoding=utf_16 + +# DOT logger +[dot] +#filename=linkchecker-out.dot +#parts=all +# default encoding is ascii since the original DOT format does not +# support other charsets, example: +#encoding=iso-8859-15 + +# CSV logger +[csv] +#filename=linkchecker-out.csv +#separator=; +#quotechar=" +#dialect=excel +#parts=all + +# SQL logger +[sql] +#filename=linkchecker-out.sql +#dbname=linksdb +#separator=; +#parts=all + +# HTML logger +[html] +#filename=linkchecker-out.html +# colors for the various parts +#colorbackground=#fff7e5 +#colorurl=#dcd5cf +#colorborder=#000000 +#colorlink=#191c83 +#colorwarning=#e0954e +#colorerror=#db4930 +#colorok=#3ba557 +#parts=all + +# failures logger +[failures] +#filename=$XDG_DATA_HOME/linkchecker/failures + +# custom xml logger +[xml] +#filename=linkchecker-out.xml +# system encoding is used by default. Example: +#encoding=iso-8859-1 + +# GraphXML logger +[gxml] +#filename=linkchecker-out.gxml +# system encoding is used by default. Example: +#encoding=iso-8859-1 + +# Sitemap logger +[sitemap] +#filename=linkchecker-out.sitemap.xml +#encoding=utf-8 +#priority=0.5 +#frequency=daily + + +##################### checking options ########################## +[checking] +# number of threads +threads=20 +# connection timeout in seconds +timeout=30 +# Time to wait for checks to finish after the user aborts the first time +# (with Ctrl-C or the abort button). +#aborttimeout=300 +# The recursion level determines how many times links inside pages are followed. +#recursionlevel=-1 +# parse a cookiefile for initial cookie data, example: +#cookiefile=/path/to/cookies.txt +# User-Agent header string to send to HTTP web servers +# Note that robots.txt are always checked with the original User-Agent. Example: +#useragent=Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) +# When checking finishes, write a memory dump to a temporary file. +# The memory dump is written both when checking finishes normally +# and when checking gets canceled. +# The memory dump only works if the python-meliae package is installed. +# Otherwise a warning is printed to install it. +#debugmemory=0 +# When checking absolute URLs inside local files, the given root directory +# is used as base URL. +# Note that the given directory must have URL syntax, so it must use a slash +# to join directories instead of a backslash. +# And the given directory must end with a slash. +# Unix example: +#localwebroot=/var/www/ +# Windows example: +#localwebroot=/C|/public_html/ +# Check SSL certificates. Set to an absolute pathname for a custom +# CA cert bundle to use. Set to zero to disable SSL certificate verification. +sslverify=0 +# Stop checking new URLs after the given number of seconds. Same as if the +# user hits Ctrl-C after X seconds. Example: +#maxrunseconds=600 +# Don't download files larger than the given number of bytes +#maxfilesizedownload=5242880 +# Don't parse files larger than the given number of bytes +#maxfilesizeparse=1048576 +# Maximum number of URLs to check. New URLs will not be queued after the +# given number of URLs is checked. Example: +#maxnumurls=153 +# Maximum number of requests per second to one host. +#maxrequestspersecond=10 +# Respect the instructions in any robots.txt files +#robotstxt=1 +# Allowed URL schemes as a comma-separated list. Example: +allowedschemes=file,http,https +# Size of the result cache. Checking more urls might increase memory usage during runtime +#resultcachesize=100000 + +##################### filtering options ########################## +[filtering] +#ignore= +# ignore everything with 'lconline' in the URL name +# lconline +# and ignore everything with 'bookmark' in the URL name +# bookmark +# and ignore all mailto: URLs +# ^mailto: +# do not recurse into the following URLs + +#nofollow= +# just an example +# http://www\.example\.com/bla + +# Ignore specified warnings (see linkchecker -h for the list of +# recognized warnings). Add a comma-separated list of warnings here +# that prevent a valid URL from being logged. Note that the warning +# will be logged for invalid URLs. Example: +#ignorewarnings=url-unicode-domain +# Regular expression to add more URLs recognized as internal links. +# Default is that URLs given on the command line are internal. +#internlinks=^http://www\.example\.net/ +# Check external links +checkextern=1 + + +##################### password authentication ########################## +[authentication] +# WARNING: if you store passwords in this configuration entry, make sure the +# configuration file is not readable by other users. +# Different user/password pairs for different URLs can be provided. +# Entries are a triple (URL regular expression, username, password), +# separated by whitespace. +# If the regular expression matches, the given user/password pair is used +# for authentication. The commandline options -u,-p match every link +# and therefore override the entries given here. The first match wins. +# At the moment, authentication is used for http[s] and ftp links. +#entry= +# Note that passwords are optional. If any passwords are stored here, +# this file should not readable by other users. +# ^https?://www\.example\.com/~calvin/ calvin mypass +# ^ftp://www\.example\.com/secret/ calvin + +# if the website requires a login via a page with an HTML form the URL of the +# page and optionally the username and password input element name attributes +# can be provided. +#loginurl=http://www.example.com/ + +# The name attributes of the username and password HTML input elements +#loginuserfield=login +#loginpasswordfield=password +# Optionally the name attributes of any additional input elements and the values +# to populate them with. Note that these are submitted without checking +# whether matching input elements exist in the HTML form. Example: +#loginextrafields= +# name1:value1 +# name 2:value 2 + +############################ Plugins ################################### +# +# uncomment sections to enable plugins + +# Check HTML anchors +#[AnchorCheck] + +# Print HTTP header info +#[HttpHeaderInfo] +# Comma separated list of header prefixes to print. +# The names are case insensitive. +# The default list is empty, so it should be non-empty when activating +# this plugin. Example: +#prefixes=Server,X- + +# Add country info to URLs +#[LocationInfo] + +# Run W3C syntax checks +#[CssSyntaxCheck] +#[HtmlSyntaxCheck] + +# Search for regular expression in page contents +#[RegexCheck] +# Example: +#warningregex=Oracle Error + +# Search for viruses in page contents +#[VirusCheck] +#clamavconf=/etc/clamav/clamd.conf + +# Check that SSL certificates have at least the given number of days validity. +#[SslCertificateCheck] +#sslcertwarndays=30 + +# Parse and check links in PDF files +#[PdfParser] + +# Parse and check links in Word files +#[WordParser] + +# Parse and check links in Markdown files. +# Supported links are: +# +# [name](http://link.com "Optional title") +# [id]: http://link.com "Optional title" +[MarkdownCheck] +# Regexp of filename +filename_re=.*\.(markdown|md(own)?|mkdn?)$ From d1db74642083505da12e0b0becb5427425796895 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Fri, 28 Jun 2024 16:47:43 -0700 Subject: [PATCH 64/81] add python script to check results --- utils/LinkChecker/.linkcheckerrc | 2 +- utils/LinkChecker/cklcresults.py | 62 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 utils/LinkChecker/cklcresults.py diff --git a/utils/LinkChecker/.linkcheckerrc b/utils/LinkChecker/.linkcheckerrc index 60b7f6f798..ed21eb031f 100644 --- a/utils/LinkChecker/.linkcheckerrc +++ b/utils/LinkChecker/.linkcheckerrc @@ -182,7 +182,7 @@ sslverify=0 # Maximum number of requests per second to one host. #maxrequestspersecond=10 # Respect the instructions in any robots.txt files -#robotstxt=1 +robotstxt=0 # Allowed URL schemes as a comma-separated list. Example: allowedschemes=file,http,https # Size of the result cache. Checking more urls might increase memory usage during runtime diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py new file mode 100644 index 0000000000..2c0134ea95 --- /dev/null +++ b/utils/LinkChecker/cklcresults.py @@ -0,0 +1,62 @@ +f = open("linkchecker-all.out") +lines = f.readlines() +f.close() + + +# +# Parse lines for status records which like like... +# +# URL `file:///home/runner/work/bssw.io/bssw.io/Articles/Blog/2020-01-usrse.md' +# . +# . +# . +# Result Valid + +def AddEntryToRecord(cr, key, line): + assert key not in cr.keys(), f"duplicate key {key}, {line}, {cr}" + cr[key] = line[11:] + +def StartNewRecord(cr, key, line): + cr.clear() + line = line[1:-1] + AddEntryToRecord(cr, key, line) + +def AddRecord(records, cr, key, line): + AddEntryToRecord(cr, key, line) + records.append(cr.copy()) + cr.clear() + +def AppendLineToPreviousKey(cr, key, line): + cr[key] += line + +records = [] +currentRecord = {} +inRecord = False +prevKey = '' +for line in lines: + + key = line[:11].replace(' ','') + + if inRecord and key == '' and prevKey != '': + AppendLineToPreviousKey(currentRecord, prevKey, line) + elif key == "URL": + assert not inRecord, "Problem starting new record" + inRecord = True + StartNewRecord(currentRecord, key, line) + elif key == "Result": + assert inRecord, "Problem ending new record" + inRecord = False + AddRecord(records, currentRecord, key, line) + elif inRecord: + AddEntryToRecord(currentRecord, key, line) + #else: + # print(f'Skipping "{line}"') + + if key != '': + prevKey = key + +for r in records: + if '200' not in r['Result']: + + print(r['Result']) + print(r['Info']) From 52e98366f1d74e4ab31e7180778e3da208785ff2 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Fri, 28 Jun 2024 17:08:39 -0700 Subject: [PATCH 65/81] fix space in link --- Articles/Blog/2021-12-sc21-swe-cse-bof.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Articles/Blog/2021-12-sc21-swe-cse-bof.md b/Articles/Blog/2021-12-sc21-swe-cse-bof.md index 3dfaab2be4..d335c5a101 100644 --- a/Articles/Blog/2021-12-sc21-swe-cse-bof.md +++ b/Articles/Blog/2021-12-sc21-swe-cse-bof.md @@ -16,7 +16,7 @@ We’ve settled on a format for the BoF that includes 3-minute lightning talks, We had six lightning talks this year, covering a wide range of topics with speakers from the US, UK, Germany, and Australia. -* [Ecosystems are the Future!](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/01-brown-ecosystems.pdf) by Benjamin Brown (Office of Advanced Scientific Computing Research (ASCR), Office of Science, U.S. Dept. of Energy). Ben discussed the importance of ecosystems in his vision for the future of the ASCR high performance computing and networking user facilities ([ALCF](https://www.alcf.anl.gov/), [NERSC](https://www.nersc.gov/), [OLCF](https://www.olcf.ornl.gov/), and[ ESnet](https://www.es.net/)) and highlighted scientific software as one of the key ecosystems. He closed with the message “Software ecosystems are research infrastructure!” +* [Ecosystems are the Future!](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/01-brown-ecosystems.pdf) by Benjamin Brown (Office of Advanced Scientific Computing Research (ASCR), Office of Science, U.S. Dept. of Energy). Ben discussed the importance of ecosystems in his vision for the future of the ASCR high performance computing and networking user facilities ([ALCF](https://www.alcf.anl.gov/), [NERSC](https://www.nersc.gov/), [OLCF](https://www.olcf.ornl.gov/), and [ESnet](https://www.es.net/)) and highlighted scientific software as one of the key ecosystems. He closed with the message “Software ecosystems are research infrastructure!” * [Open Source for Researchers](https://doi.org/10.5281/zenodo.5655022) by Yo Yehudi (Wellcome Trust). Yo encouraged us to up our game for our open source software projects, making them more open and more accessible to others. She touched on readme files, roadmaps, contributor guides, codes of conduct, a requested citation, contact information, and using an issue tracker and also suggested some resources to help folks get started contributing to open source software. * [The Internat. CSE Master Program at TUM](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/03-bader-masterprogram.pdf) by Michael Bader (Technical University of Munich, TUM). Michael described the International Master’s program in Computational Science and Engineering, which has been offered at TUM since 2001, currently serving approximately 50 students per year. The 4-semester program combines classes in computer science, numerical analysis, and scientific computing - with one of the key challenges being how to guide students from various backgrounds towards becoming experts in software development for supercomputing applications. * [Senior Level RSE career paths (with an s)](https://betterscientificsoftware.github.io/swe-cse-bof/assets/2021-11-sc21-bof/04-katz-seniorrse.pdf) by Daniel S. Katz (University of Illinois at Urbana-Champaign). Dan presented ideas to help define paths for career progression for research software engineers (RSEs), looking particularly at offering a richer set of opportunities at senior levels to allow RSEs to explore different roles, emphasizing different skills. From ca247b7c764080d9de225ed3f772192233f8aeb8 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 29 Jun 2024 12:40:58 -0700 Subject: [PATCH 66/81] set recursion depth --- utils/LinkChecker/.linkcheckerrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/LinkChecker/.linkcheckerrc b/utils/LinkChecker/.linkcheckerrc index ed21eb031f..181c28d656 100644 --- a/utils/LinkChecker/.linkcheckerrc +++ b/utils/LinkChecker/.linkcheckerrc @@ -145,7 +145,7 @@ timeout=30 # (with Ctrl-C or the abort button). #aborttimeout=300 # The recursion level determines how many times links inside pages are followed. -#recursionlevel=-1 +recursionlevel=1 # parse a cookiefile for initial cookie data, example: #cookiefile=/path/to/cookies.txt # User-Agent header string to send to HTTP web servers From 99b2a9ea2acd328903f02d9d1185ab8dbb202a7b Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 29 Jun 2024 14:11:44 -0700 Subject: [PATCH 67/81] adjust config --- utils/LinkChecker/.linkcheckerrc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/utils/LinkChecker/.linkcheckerrc b/utils/LinkChecker/.linkcheckerrc index 181c28d656..c2c1d95f5d 100644 --- a/utils/LinkChecker/.linkcheckerrc +++ b/utils/LinkChecker/.linkcheckerrc @@ -173,7 +173,7 @@ sslverify=0 # user hits Ctrl-C after X seconds. Example: #maxrunseconds=600 # Don't download files larger than the given number of bytes -#maxfilesizedownload=5242880 +maxfilesizedownload=100000 # Don't parse files larger than the given number of bytes #maxfilesizeparse=1048576 # Maximum number of URLs to check. New URLs will not be queued after the @@ -198,8 +198,7 @@ allowedschemes=file,http,https # and ignore all mailto: URLs # ^mailto: # do not recurse into the following URLs - -#nofollow= +nofollow=.* # just an example # http://www\.example\.com/bla @@ -210,7 +209,7 @@ allowedschemes=file,http,https #ignorewarnings=url-unicode-domain # Regular expression to add more URLs recognized as internal links. # Default is that URLs given on the command line are internal. -#internlinks=^http://www\.example\.net/ +#internlinks=.* # Check external links checkextern=1 From 969905fdbee231c972f1594916a3e960b6d3254c Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 29 Jun 2024 14:32:15 -0700 Subject: [PATCH 68/81] ignore gh usernames --- utils/LinkChecker/.linkcheckerrc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/LinkChecker/.linkcheckerrc b/utils/LinkChecker/.linkcheckerrc index c2c1d95f5d..a7e5fa87df 100644 --- a/utils/LinkChecker/.linkcheckerrc +++ b/utils/LinkChecker/.linkcheckerrc @@ -190,7 +190,8 @@ allowedschemes=file,http,https ##################### filtering options ########################## [filtering] -#ignore= +# Ignore github usernames +ignore=https?://github.com/[^/]*$ # ignore everything with 'lconline' in the URL name # lconline # and ignore everything with 'bookmark' in the URL name From 2bd4298f3ab1a713bed0bed278661a2181a9c3eb Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sat, 29 Jun 2024 14:34:17 -0700 Subject: [PATCH 69/81] update checker logic --- utils/LinkChecker/cklcresults.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index 2c0134ea95..8b03922a56 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -14,11 +14,11 @@ def AddEntryToRecord(cr, key, line): assert key not in cr.keys(), f"duplicate key {key}, {line}, {cr}" - cr[key] = line[11:] + cr[key] = line[11:].replace('\n','') def StartNewRecord(cr, key, line): cr.clear() - line = line[1:-1] + line = line[1:-2] AddEntryToRecord(cr, key, line) def AddRecord(records, cr, key, line): @@ -56,7 +56,18 @@ def AppendLineToPreviousKey(cr, key, line): prevKey = key for r in records: - if '200' not in r['Result']: - - print(r['Result']) - print(r['Info']) + + # ignore the entry for the .md file itself + if 'ParentURL' not in r.keys(): + continue + + goodURL = True + if '200' not in r['Result'][:20]: + if r['Result'] == 'Valid': + if 'Warning' in r.keys() and 'Redirected' not in r['Warning']: + goodURL = False + else: + goodURL = False + + if not goodURL: + print(f"{r['URL']} in \n\t {r['ParentURL'][41:]}") From f53a1e0721b406a174d0462b6a655a65460ced36 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 30 Jun 2024 11:24:27 -0700 Subject: [PATCH 70/81] add presistent lists --- utils/LinkChecker/bad_links.txt | 0 utils/LinkChecker/cklcresults.py | 103 ++++++++++++++++++++++------ utils/LinkChecker/trouble_links.txt | 0 3 files changed, 81 insertions(+), 22 deletions(-) create mode 100644 utils/LinkChecker/bad_links.txt create mode 100644 utils/LinkChecker/trouble_links.txt diff --git a/utils/LinkChecker/bad_links.txt b/utils/LinkChecker/bad_links.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index 8b03922a56..0f8c87fcfb 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -1,16 +1,4 @@ -f = open("linkchecker-all.out") -lines = f.readlines() -f.close() - - -# -# Parse lines for status records which like like... -# -# URL `file:///home/runner/work/bssw.io/bssw.io/Articles/Blog/2020-01-usrse.md' -# . -# . -# . -# Result Valid +import ast, datetime def AddEntryToRecord(cr, key, line): assert key not in cr.keys(), f"duplicate key {key}, {line}, {cr}" @@ -21,18 +9,32 @@ def StartNewRecord(cr, key, line): line = line[1:-2] AddEntryToRecord(cr, key, line) -def AddRecord(records, cr, key, line): +def AddRecord(records, cr, key, line, now): AddEntryToRecord(cr, key, line) + AddEntryToRecord(cr, 'Date', now) records.append(cr.copy()) cr.clear() def AppendLineToPreviousKey(cr, key, line): cr[key] += line +# Open log and read all lines into a python list object +f = open('linkchecker-all.out','r') +lines = f.readlines() +f.close() + +# +# Process contents of log, looking for groups of lines defining +# each link tested and its results. Each link is its own dict +# object (record) and we append each record to a list, records. +# records = [] currentRecord = {} inRecord = False prevKey = '' +print("Consider date resolution for PRs\n") +nowdate = datetime.datetime.now() +nowstr = ' '*11 + nowdate.strftime("%Y-%m-%d") for line in lines: key = line[:11].replace(' ','') @@ -46,28 +48,85 @@ def AppendLineToPreviousKey(cr, key, line): elif key == "Result": assert inRecord, "Problem ending new record" inRecord = False - AddRecord(records, currentRecord, key, line) + AddRecord(records, currentRecord, key, line, nowstr) elif inRecord: AddEntryToRecord(currentRecord, key, line) - #else: - # print(f'Skipping "{line}"') if key != '': prevKey = key +# +# Open trouble_links.txt and read all contents +# +trouble_links = [] +with open('../../utils/LinkChecker/trouble_links.txt','r') as file: + for line in file: + trouble_links.append(ast.literal_eval(line.strip())) +trouble_links_original_size = len(trouble_links) + +# +# Open bad_links.log +# +bad_links = [] +with open('../../utils/LinkChecker/bad_links.txt','r') as file: + for line in file: + bad_links.append(ast.literal_eval(line.strip())) +bad_links_original_size = len(bad_links) + +# +# Process all the links, looking for trouble links. +# Links that have consistently been troubled are deemed bad. +# for r in records: # ignore the entry for the .md file itself if 'ParentURL' not in r.keys(): continue - goodURL = True + linkOK = True if '200' not in r['Result'][:20]: if r['Result'] == 'Valid': if 'Warning' in r.keys() and 'Redirected' not in r['Warning']: - goodURL = False + linkOK = False else: - goodURL = False + linkOK = False + + prev_trouble_records = [x for x in trouble_links if x['URL'] == r['URL']] + + if linkOK: + + # + # If link is good, make sure any instance of it being in trouble in + # the past is removed. + # + for x in prev_trouble_records: + try: + trouble_links.remove(x) + except: + pass + + else: - if not goodURL: - print(f"{r['URL']} in \n\t {r['ParentURL'][41:]}") + if prev_trouble_records == []: + trouble_links.append(r) + else: + for x in prev_trouble_records: + date = datetime.datetime.strptime(x['Date'],'%Y-%m-%d') + if (nowdate - date).days > 90: + bad_links += [x] + +# +# Update trouble_links file if modified +# +if len(trouble_links) != trouble_links_original_size: + with open('../../utils/LinkChecker/trouble_links.txt','w') as file: + for rec in trouble_links: + file.write(str(rec)+'\n') + +# +# Update bad_links file if modified +# +if len(bad_links) > bad_links_original_size: + with open('../../utils/LinkChecker/bad_links.txt','w') as file: + for rec in bad_links: + file.write(str(rec)+'\n') diff --git a/utils/LinkChecker/trouble_links.txt b/utils/LinkChecker/trouble_links.txt new file mode 100644 index 0000000000..e69de29bb2 From f024ad675ee41e65c814f439f3ec0baf4fb3f81f Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 30 Jun 2024 20:45:39 -0700 Subject: [PATCH 71/81] whole process --- .github/workflows/check-urls.yml | 40 ++++++++++++++++++++++++++++---- utils/LinkChecker/cklcresults.py | 35 +++++++++++++++++++--------- 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index c5f7d73140..48c5331d13 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -41,8 +41,24 @@ jobs: tmp=$(echo "${{ env.ignore_file_patterns }}" | tr '\n' ' ') echo "ignore_file_patterns=$tmp" >> $GITHUB_OUTPUT - - name: Checkout Repository + - name: Checkout Repo for PR branch + if: ${{ github.event_name == 'pull_request' }} + uses: actions/checkout@v4 + + - name: Checkout Repo for link check + if: ${{ github.event_name != 'pull_request' }} uses: actions/checkout@v4 + with: + ref: 'sched-link-checks' + + - name: Sync main to link check branch + if: ${{ github.event_name != 'pull_request' }} + run: | + git config user.name 'github-actions' + git config user.email 'github-actions@github.com' + git fetch origin main + git merge origin/main --no-edit -X thiers + git push origin sched-link-checks - name: Get Changed Files (for PRs) if: ${{ github.event_name == 'pull_request' }} @@ -62,7 +78,6 @@ jobs: echo "ignore_file_patterns=${{ steps.setup_vars.outputs.ignore_file_patterns }}" >> $GITHUB_OUTPUT fi - #exclude_patterns: ${{ steps.setup_vars.outputs.ignore_url_patterns }} - name: Check URLs in selected files run: | for f in ${{ steps.file_list.outputs.files }}; do @@ -78,11 +93,27 @@ jobs: cat linkchecker.out >> linkchecker-all.out done + - name: Process log + run: | + python utils/LinkChecker/cklcresults.py ${{ github.event_name }} + + - name: Finalize Check Status + if: ${{ github.event_name == 'pull_request' }} + run: | + [ $(wc -c utils/LinkChecker/bad_links.txt) -gt 0 ] && exit 1 + - name: Upload artifact + if: always() uses: actions/upload-artifact@v4 with: - name: my-artifact - path: linkchecker-all.out + name: bad-links + path: utils/LinkChecker/bad_links.txt + + - name: Update link logs + if: ${{ github.event_name != 'pull_request' }} + run: | + git commit -m 'Update link logs' + git push origin sched-link-checks # # Keep the recurring failures and definitely bad lists in repo on @@ -102,6 +133,7 @@ jobs: # Report success if definitely bad list is empty, otherwise failure # generate email with links or actual data # +# you have to use file:// on command-line to checker # bare URLs in markdown are not actually links and will not be checked. Many # markdown renderers and browsers will recognize these and handle them as links diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index 0f8c87fcfb..3dbf52040d 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -1,4 +1,4 @@ -import ast, datetime +import ast, datetime, sys def AddEntryToRecord(cr, key, line): assert key not in cr.keys(), f"duplicate key {key}, {line}, {cr}" @@ -18,15 +18,22 @@ def AddRecord(records, cr, key, line, now): def AppendLineToPreviousKey(cr, key, line): cr[key] += line +# +# Handle command-line args to this script +ghEvent = "" +if len(sys.argv) > 1: + ghEvent = sys.argv[1] + +# # Open log and read all lines into a python list object f = open('linkchecker-all.out','r') lines = f.readlines() f.close() # -# Process contents of log, looking for groups of lines defining -# each link tested and its results. Each link is its own dict -# object (record) and we append each record to a list, records. +# Process contents of linkchecker log, looking for groups of lines +# defining each link tested and its results. Each link is its own dict +# object (record) and we append each record to a list named records. # records = [] currentRecord = {} @@ -59,18 +66,20 @@ def AppendLineToPreviousKey(cr, key, line): # Open trouble_links.txt and read all contents # trouble_links = [] -with open('../../utils/LinkChecker/trouble_links.txt','r') as file: - for line in file: - trouble_links.append(ast.literal_eval(line.strip())) +if ghEvent != "pull_request": + with open('../../utils/LinkChecker/trouble_links.txt','r') as file: + for line in file: + trouble_links.append(ast.literal_eval(line.strip())) trouble_links_original_size = len(trouble_links) # # Open bad_links.log # bad_links = [] -with open('../../utils/LinkChecker/bad_links.txt','r') as file: - for line in file: - bad_links.append(ast.literal_eval(line.strip())) +if ghEvent != "pull_request": + with open('../../utils/LinkChecker/bad_links.txt','r') as file: + for line in file: + bad_links.append(ast.literal_eval(line.strip())) bad_links_original_size = len(bad_links) # @@ -91,6 +100,10 @@ def AppendLineToPreviousKey(cr, key, line): else: linkOK = False + if ghEvent == "pull_request" and not linkOK: + bad_links += [r] + continue + prev_trouble_records = [x for x in trouble_links if x['URL'] == r['URL']] if linkOK: @@ -118,7 +131,7 @@ def AppendLineToPreviousKey(cr, key, line): # # Update trouble_links file if modified # -if len(trouble_links) != trouble_links_original_size: +if ghEvent != "pull_request" and len(trouble_links) != trouble_links_original_size: with open('../../utils/LinkChecker/trouble_links.txt','w') as file: for rec in trouble_links: file.write(str(rec)+'\n') From 7b0e42e9eb3a0e8586474a2280d42a07add9bb7d Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 30 Jun 2024 20:50:16 -0700 Subject: [PATCH 72/81] fix paths --- utils/LinkChecker/cklcresults.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index 3dbf52040d..fea8974cb8 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -67,7 +67,7 @@ def AppendLineToPreviousKey(cr, key, line): # trouble_links = [] if ghEvent != "pull_request": - with open('../../utils/LinkChecker/trouble_links.txt','r') as file: + with open('utils/LinkChecker/trouble_links.txt','r') as file: for line in file: trouble_links.append(ast.literal_eval(line.strip())) trouble_links_original_size = len(trouble_links) @@ -77,7 +77,7 @@ def AppendLineToPreviousKey(cr, key, line): # bad_links = [] if ghEvent != "pull_request": - with open('../../utils/LinkChecker/bad_links.txt','r') as file: + with open('utils/LinkChecker/bad_links.txt','r') as file: for line in file: bad_links.append(ast.literal_eval(line.strip())) bad_links_original_size = len(bad_links) @@ -132,7 +132,7 @@ def AppendLineToPreviousKey(cr, key, line): # Update trouble_links file if modified # if ghEvent != "pull_request" and len(trouble_links) != trouble_links_original_size: - with open('../../utils/LinkChecker/trouble_links.txt','w') as file: + with open('utils/LinkChecker/trouble_links.txt','w') as file: for rec in trouble_links: file.write(str(rec)+'\n') @@ -140,6 +140,6 @@ def AppendLineToPreviousKey(cr, key, line): # Update bad_links file if modified # if len(bad_links) > bad_links_original_size: - with open('../../utils/LinkChecker/bad_links.txt','w') as file: + with open('utils/LinkChecker/bad_links.txt','w') as file: for rec in bad_links: file.write(str(rec)+'\n') From f840b174902a270ac8e45debec5bd7b83349af34 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 30 Jun 2024 20:54:09 -0700 Subject: [PATCH 73/81] upd bad_links only if !PR --- utils/LinkChecker/cklcresults.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index fea8974cb8..7b476f95ec 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -131,7 +131,7 @@ def AppendLineToPreviousKey(cr, key, line): # # Update trouble_links file if modified # -if ghEvent != "pull_request" and len(trouble_links) != trouble_links_original_size: +if ghEvent != 'pull_request' and len(trouble_links) != trouble_links_original_size: with open('utils/LinkChecker/trouble_links.txt','w') as file: for rec in trouble_links: file.write(str(rec)+'\n') @@ -139,7 +139,7 @@ def AppendLineToPreviousKey(cr, key, line): # # Update bad_links file if modified # -if len(bad_links) > bad_links_original_size: +if ghEvent != 'pull_request' and len(bad_links) > bad_links_original_size: with open('utils/LinkChecker/bad_links.txt','w') as file: for rec in bad_links: file.write(str(rec)+'\n') From 46f77a109bf6574ad40ddcb1da33af9a83424290 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 30 Jun 2024 22:36:08 -0700 Subject: [PATCH 74/81] fix pr bad count logic --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 48c5331d13..fa9f088ac8 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -100,7 +100,7 @@ jobs: - name: Finalize Check Status if: ${{ github.event_name == 'pull_request' }} run: | - [ $(wc -c utils/LinkChecker/bad_links.txt) -gt 0 ] && exit 1 + [ $(wc -l ../../utils/LinkChecker/bad_links.txt | awk '{print $1}') -gt 0 ] && exit 1 - name: Upload artifact if: always() From 64ed9e89836548c26ce5ad5faea939ca3ad9f7e8 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 30 Jun 2024 22:46:33 -0700 Subject: [PATCH 75/81] fix path --- .github/workflows/check-urls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index fa9f088ac8..1732ea06c9 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -100,7 +100,7 @@ jobs: - name: Finalize Check Status if: ${{ github.event_name == 'pull_request' }} run: | - [ $(wc -l ../../utils/LinkChecker/bad_links.txt | awk '{print $1}') -gt 0 ] && exit 1 + [ $(wc -l utils/LinkChecker/bad_links.txt | awk '{print $1}') -gt 0 ] && exit 1 - name: Upload artifact if: always() From 9ecf2966b10cfee35a25d517f58a5f1d5e572c21 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Sun, 30 Jun 2024 23:51:56 -0700 Subject: [PATCH 76/81] adjust failure logic --- .github/workflows/check-urls.yml | 9 ++------- utils/LinkChecker/cklcresults.py | 7 +++++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 1732ea06c9..7124b93018 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -97,17 +97,12 @@ jobs: run: | python utils/LinkChecker/cklcresults.py ${{ github.event_name }} - - name: Finalize Check Status - if: ${{ github.event_name == 'pull_request' }} - run: | - [ $(wc -l utils/LinkChecker/bad_links.txt | awk '{print $1}') -gt 0 ] && exit 1 - - name: Upload artifact - if: always() + if: ${{ github.event_name == 'pull_request' }} uses: actions/upload-artifact@v4 with: name: bad-links - path: utils/LinkChecker/bad_links.txt + path: bad_links.txt - name: Update link logs if: ${{ github.event_name != 'pull_request' }} diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index 7b476f95ec..7dacdb3213 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -139,7 +139,10 @@ def AppendLineToPreviousKey(cr, key, line): # # Update bad_links file if modified # -if ghEvent != 'pull_request' and len(bad_links) > bad_links_original_size: - with open('utils/LinkChecker/bad_links.txt','w') as file: +bad_links_out = 'utils/LinkChecker/bad_links.txt' +if ghEvent == 'pull_request': + bad_links_out = 'bad_links.txt' +if len(bad_links) > bad_links_original_size: + with open(bad_links_out, 'w') as file: for rec in bad_links: file.write(str(rec)+'\n') From fbdb9bd182f92929d12efdc8a21bf80dd5f1bcf0 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 1 Jul 2024 08:18:21 -0700 Subject: [PATCH 77/81] adjust PR artifact --- utils/LinkChecker/cklcresults.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index 7dacdb3213..d2eecf241a 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -94,7 +94,7 @@ def AppendLineToPreviousKey(cr, key, line): linkOK = True if '200' not in r['Result'][:20]: - if r['Result'] == 'Valid': + if r['Result'][:5] == 'Valid': if 'Warning' in r.keys() and 'Redirected' not in r['Warning']: linkOK = False else: @@ -128,6 +128,13 @@ def AppendLineToPreviousKey(cr, key, line): if (nowdate - date).days > 90: bad_links += [x] +# +# Sometimes, we can get duplicate entries in the lists. +# Remove any of those now. +# +trouble_links = list(set(trouble_links)) +bad_links = list(set(bad_links)) + # # Update trouble_links file if modified # @@ -145,4 +152,7 @@ def AppendLineToPreviousKey(cr, key, line): if len(bad_links) > bad_links_original_size: with open(bad_links_out, 'w') as file: for rec in bad_links: - file.write(str(rec)+'\n') + if ghEvent == 'pull_request': + file.write(+str(rec['ParentURL'][41:]+', '+str(rec['URL']+'\n') + else: + file.write(str(rec)+'\n') From 55beebc4ee48e72788c6143c7dc381f4b036880a Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 1 Jul 2024 08:23:10 -0700 Subject: [PATCH 78/81] fix syntax error --- utils/LinkChecker/cklcresults.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index d2eecf241a..783aa3a30c 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -153,6 +153,6 @@ def AppendLineToPreviousKey(cr, key, line): with open(bad_links_out, 'w') as file: for rec in bad_links: if ghEvent == 'pull_request': - file.write(+str(rec['ParentURL'][41:]+', '+str(rec['URL']+'\n') + file.write(+str(rec['ParentURL'][41:])+', '+str(rec['URL'])+'\n') else: file.write(str(rec)+'\n') From 6fbe1f66f8f0c33b262d4a22661d7cd07cbadc00 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 1 Jul 2024 08:33:00 -0700 Subject: [PATCH 79/81] fix remove dups --- utils/LinkChecker/cklcresults.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index 783aa3a30c..6f5a834451 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -128,12 +128,27 @@ def AppendLineToPreviousKey(cr, key, line): if (nowdate - date).days > 90: bad_links += [x] +# +# Removing dups from a list is ordinarily easier than this. +# But, in our case, each item in the list is a dict and so +# the logic to handle those correctly is a little more complex. +# +def RemoveDups(alist): + seen = set() + new_list = [] + for d in alist: + dict_fs = frozenset(d.items()) + if dict_fs not in seen: + new_list.append(d) + seen.add(dict_fs) + return new_list + # # Sometimes, we can get duplicate entries in the lists. # Remove any of those now. # -trouble_links = list(set(trouble_links)) -bad_links = list(set(bad_links)) +trouble_links = RemoveDups(trouble_links) +bad_links = RemoveDups(bad_links) # # Update trouble_links file if modified From ce73e77244ad13394a9577b4c41b07c2247e92a8 Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 1 Jul 2024 08:37:26 -0700 Subject: [PATCH 80/81] fix stray + char --- utils/LinkChecker/cklcresults.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/LinkChecker/cklcresults.py b/utils/LinkChecker/cklcresults.py index 6f5a834451..8061eb6a32 100644 --- a/utils/LinkChecker/cklcresults.py +++ b/utils/LinkChecker/cklcresults.py @@ -168,6 +168,6 @@ def RemoveDups(alist): with open(bad_links_out, 'w') as file: for rec in bad_links: if ghEvent == 'pull_request': - file.write(+str(rec['ParentURL'][41:])+', '+str(rec['URL'])+'\n') + file.write(str(rec['ParentURL'][41:]) + ', ' + str(rec['URL']) + '\n') else: - file.write(str(rec)+'\n') + file.write(str(rec) + '\n') From c239c6a69c83a979dd5739925b7805c51025317a Mon Sep 17 00:00:00 2001 From: "Mark C. Miller" Date: Mon, 1 Jul 2024 10:31:33 -0700 Subject: [PATCH 81/81] fix ignore files feature --- .github/workflows/check-urls.yml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml index 7124b93018..04abea815a 100644 --- a/.github/workflows/check-urls.yml +++ b/.github/workflows/check-urls.yml @@ -7,15 +7,11 @@ on: pull_request: branches: [ main ] env: - ignore_url_patterns: | - http://localhost:4000 - https://preview.bssw.io - https://github.com/ ignore_file_patterns: | - docs/ - images/ - utils/ - Events/ + docs + images + utils + Events jobs: check-urls: @@ -36,8 +32,6 @@ jobs: - name: Reformat environment variables id: setup_vars run: | - tmp=$(echo "${{ env.ignore_url_patterns }}" | tr '\n' ' ') - echo "ignore_url_patterns=$tmp" >> $GITHUB_OUTPUT tmp=$(echo "${{ env.ignore_file_patterns }}" | tr '\n' ' ') echo "ignore_file_patterns=$tmp" >> $GITHUB_OUTPUT @@ -88,6 +82,10 @@ jobs: if [ "$ef" = "$f" ]; then continue 2 # ignore this file fi + fd=$(echo $f | cut -d'/' -f1) + if [ "$ef" = "$fd" ]; then + continue 2 # ignore this dir + fi done linkchecker -f utils/LinkChecker/.linkcheckerrc file://$(pwd)/$f >> linkchecker.out || true cat linkchecker.out >> linkchecker-all.out