diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2005-03-29 14:08:05 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2005-03-29 14:08:05 +0200 |
commit | 61846f4d01e6e9a15b78f4c82e80fa6e711c3cd8 (patch) | |
tree | 4248c91ac2c78ddb33a1cfe160c26234c3e70e7c | |
parent | 4be06ae90467f04335d031ec8e78525167941d45 (diff) |
import of release 1.01.0
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@2 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- | BUGS | 16 | ||||
-rw-r--r-- | CHANGES | 136 | ||||
-rw-r--r-- | COPYING | 340 | ||||
-rw-r--r-- | CREDITS | 16 | ||||
-rw-r--r-- | HISTORY | 567 | ||||
-rw-r--r-- | HISTORY.linbot | 262 | ||||
-rw-r--r-- | INSTALL | 180 | ||||
-rw-r--r-- | README | 41 | ||||
-rw-r--r-- | TODO | 24 | ||||
-rw-r--r-- | config.py | 157 | ||||
-rw-r--r-- | contrib/plugins/about.py | 47 | ||||
-rw-r--r-- | debugio.py | 34 | ||||
-rw-r--r-- | htmlparse.py | 129 | ||||
-rw-r--r-- | httpcodes.py | 58 | ||||
-rw-r--r-- | myUrlLib.py | 303 | ||||
-rw-r--r-- | plugins/__init__.py | 17 | ||||
-rw-r--r-- | plugins/badlinks.py | 56 | ||||
-rw-r--r-- | plugins/external.py | 40 | ||||
-rw-r--r-- | plugins/images.py | 58 | ||||
-rw-r--r-- | plugins/notchkd.py | 46 | ||||
-rw-r--r-- | plugins/notitles.py | 47 | ||||
-rw-r--r-- | plugins/problems.py | 53 | ||||
-rw-r--r-- | plugins/rptlib.py | 290 | ||||
-rw-r--r-- | plugins/sitemap.py | 79 | ||||
-rw-r--r-- | plugins/slow.py | 61 | ||||
-rw-r--r-- | plugins/whatsnew.py | 49 | ||||
-rw-r--r-- | plugins/whatsold.py | 50 | ||||
-rw-r--r-- | robotparser.py | 103 | ||||
-rw-r--r-- | schemes/__init__.py | 18 | ||||
-rw-r--r-- | schemes/filelink.py | 57 | ||||
-rw-r--r-- | schemes/ftplink.py | 125 | ||||
-rw-r--r-- | schemes/httplink.py | 167 | ||||
-rw-r--r-- | version.py | 24 | ||||
-rw-r--r-- | webcheck.css | 126 | ||||
-rwxr-xr-x | webcheck.py | 145 | ||||
-rwxr-xr-x | webcheck.sh | 4 |
36 files changed, 3925 insertions, 0 deletions
@@ -0,0 +1,16 @@ +Bug reports should be sent to the webcheck mailing list. If you absolutely +cannot subscribe to the mailing list then you may report bugs to +mwm@mired.org. See INSTALL for details. + +Known bugs: + +I tried webcheck on a site that used FrontPage publishing on IIS and +IIS reports error 406 whenever webcheck attempts to retrieve the HEADers +for a document in one of the "underscore" directories. I'm not yet +sure why this happens, but I doubt it's really webcheck's fault. I +might just have to code a way around it. In the meantime, you can +usually yank these URLs with -y '/_' or something similar. + +Some (IIS?) servers seem to be reporting -1 as an HTTP status code. +I'm not sure what that means or what to do about it. + @@ -0,0 +1,136 @@ +Changes in webcheck 1.0 + ++ Don't send accept headers, as they weren't valid. + ++ WARN_OLD_VERSION no longer works, until I decide what to do about it. + ++ Name changed to webcheck. + ++ Fixed typos in INSTALL. + ++ Changes so it works with python 2.0. + +Changes in 1.0b10 + +b Fixed bug when server redirects to a document in robots.txt (does not show + up as broken (hopefully)) + ++ Filename mangling in filelink.py to help OS/2 (and Win32) (Patch submitted + by Steffen Siebert <siebert@logware.de>) + ++ Added WARN_OLD_VERSION config.py option. If this option is set to true + (the default) Linbot will check its version number and the version + numbers of its plugins against a global registry on the Net. If it + finds that a version is not the latest, it will print a warning on the + reports along with a link you can follow to download the latest version. + I think it's neat. You might find it annoying. + ++ Added preliminary support for authenticating proxies, though it does not + work correctly yet. + ++ Added -r (redirect depth) and REDIRECT_DEPTH option in config.py to indicate + the number of redirects Linbot should follow when following a link. 
Thanks + to Andrea Glorioso <sama@intercity.it> for the patch. + ++ Added debugio module that handles debugging and I/O + ++ Added -q (quiet) option. Use it to suppress output + ++ Added -d (debug) option and DEBUG_LEVEL variable in config.py for debugging + ++ Added version module and removed __version__ and __author__ from all the + modules (except plugins). + +b Fixed bug in Linbot using putrequest() instead of putheader() when requesting + header information. Thanks to Andrea Glorioso <sama@intercity.it> for + fixing this glitch (and Seth Chaiklin <seth@psy.au.dk> for noticing). + +Changes in 1.0b9 + ++ If you use the -o command-line option or the OUTPUT_DIR config file option + and the directory does not exist, linbot will create it for you (provided + that it has the correct permissions, etc.). Thanks to Andrea Glorioso + <sama@intercity.it> for this feature. + ++ Added a CREDITS file and probably left a lot of people out. If you think + you should be in it let me know (marduk@python.net). + +b Linbot will now report to the server that it can accept any MIME type (found + in mimetypes.py). This should fix the "406: No acceptable objects found" + error that some servers report. + +b Linbot correctly identifies itself as "Linbot <version>" on HEAD requests + as well as GET requests. + +Changes in 1.0b8 + +b Fixed bug when no images are reported for documents having 0 links + If you don't know what this means it probably wasn't a problem for you. + +b Fixed code that was messing with arguments passed via -x and -y and caused + unexpected results and/or errors. + +b -b flag should work this time (for real) + +b Cosmetic changes (reports didn't look the way I thought they should in IE4. 
+ (and may not still as I haven't had a chance to check it yet) + +b Linbot won't follow infinite redirects (currently hardcoded to max of 5 + redirects per document) + +Changes in 1.0b6 + ++ Minor change in ftplink.py should allow better ftp link checking + ++ You can now press CTRL-C (or whatever your operating system supports) to break + out of a linbot run. However, the work linbot does is not saved (yet). + +b Fixed problem when server redirects a URL to itself. This fix seems to work + for most servers I've tried but there are a few more out there that I need to + take a look at. + +b Fixed bug that caused linbot to not check for yanked URLs + ++ Added -l command-line option. Usage: -l <url> where <url> is a url pointing + to an image to be used as the report's logo. + +b "patched" strings.py so that it can better parse html files created in + Windows/DOS (I think). + ++ Made report LOGO a link to the base url + ++ httplink does not HEAD a redirected URL if it is already in the link list + (performance improvement) + +- Removed LOGO_ALT from config.py + ++ Changed my email address to marduk@python.net. The official home page of + Linbot will probably also change with the next release so stay tuned. + +Changes in 1.0b5 (from 1.0b4) + ++ Added a contrib directory. Right now it just contains the about plugin. Other + plugins will be included if people contribute them. Also, the man page will + return once I have updated it. Those ugly buttons are obsolete. + ++ Linbot now "inlines" stylesheets. This has the benefits of 1) better support + of Netscape browsers (so I hear) and 2) I don't have to document to put + linbot.css in the output directory since it grabs it from starship 8*) + +b Handling of error for when robots.txt cannot be retrieved. + ++ Malformed urls are trapped (sorry, I had that commented out) + +b FTP link handling is totally rewritten. Fortunately it shouldn't crash anymore. + Unfortunately it doesn't really work reliably and probably never will. 
See + README.ftp for details. + +b Two bugs in HTTP proxy handling made it almost completely unusable, though + conveniently seemed to cancel each other out when I was testing. + +b Too many files error on large sites should be fixed. Thanks to Andrew Kuchling + et al for suggestions. + +b Bug when some servers erroneously report (or don't report) Content-Length header + fixed. + @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) 
Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. @@ -0,0 +1,16 @@ +The following entities have contributed to webcheck in some fashion. Please +know that this is not a complete list. Most likely I've forgotten someone. +If you would like to be included/removed from this list, send email to +mwm@mired.org. Thank you to all the contributors to webcheck. + + + Contributors + ---------------------------------------- + +Mike Meyer mwm@mired.org +Marduk marduk@python.net +Oleg Broytmann phd2@earthlink.net +Andrea Glorioso sama@intercity.it +Andrew Kuchling akuchlin@cnri.reston.va.us +Jean Pierre LeJacq jplejacq@quoininc.com +Steffen Siebert siebert@logware.de @@ -0,0 +1,567 @@ +//depot/mwm/webcheck/BUGS +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/CHANGES +... #3 change 2087 edit on 2002/04/02 by mwm@guru (text) + + Note that we don't send accept headers any more, and fix the URL for + linkbot in README. + +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... 
#1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/COPYING +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/CREDITS +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/HISTORY.linbot +... #1 change 2185 add on 2002/05/04 by mwm@guru (text) + + Add the linbot history file. + +//depot/mwm/webcheck/INSTALL +... #3 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #2 change 2079 edit on 2002/04/02 by mwm@guru (text) + + Apply the patches from the FreeBSD port. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/README +... #3 change 2087 edit on 2002/04/02 by mwm@guru (text) + + Note that we don't send accept headers any more, and fix the URL for + linkbot in README. + +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. 
+ +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/TODO +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/config.py +... #4 change 2083 edit on 2002/04/02 by mwm@guru (text) + + Change config.py to match my own version. + +... #3 change 2082 edit on 2002/04/02 by mwm@guru (text) + + Move the stylesheet and LOGO references from marduk's - now + non-existent - site. + +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/debugio.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/htmlparse.py +... #4 change 2156 edit on 2002/04/28 by mwm@guru (text) + + Deal with ambiguous tabs in the source. + +... 
#3 change 2090 edit on 2002/04/02 by mwm@guru (text) + + Fix "import *"'s that caused 2.2 to choke. + +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/httpcodes.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/myUrlLib.py +... #5 change 2156 edit on 2002/04/28 by mwm@guru (text) + + Deal with ambiguous tabs in the source. + +... #4 change 2085 edit on 2002/04/02 by mwm@guru (text) + + Change the "import *"'s that were causing problems to import just the + one name we needed. + +... #3 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #2 change 2079 edit on 2002/04/02 by mwm@guru (text) + + Apply the patches from the FreeBSD port. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preparation for the + rename for my distribution. + +//depot/mwm/webcheck/robotparser.py +... 
#3 change 2156 edit on 2002/04/28 by mwm@guru (text) + + Deal with ambiguous tabs in the source. + +... #2 change 2079 edit on 2002/04/02 by mwm@guru (text) + + Apply the patches from the FreeBSD port. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/version.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/webcheck.css +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2080 branch on 2002/04/02 by mwm@guru (text) + + Stage one of the rename - fix the file names. + +... ... branch from //depot/mwm/webcheck/linbot.css#1 +//depot/mwm/webcheck/webcheck.py +... #4 change 2156 edit on 2002/04/28 by mwm@guru (xtext) + + Deal with ambiguous tabs in the source. + +... #3 change 2091 edit on 2002/04/02 by mwm@guru (xtext) + + Change one last "import *". + +... #2 change 2081 edit on 2002/04/02 by mwm@guru (xtext) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2080 branch on 2002/04/02 by mwm@guru (xtext) + + Stage one of the rename - fix the file names. + +... ... 
branch from //depot/mwm/webcheck/linbot.py#1 +//depot/mwm/webcheck/webcheck.sh +... #4 change 2157 edit on 2002/04/28 by mwm@guru (xtext) + + Add the director the python binary resides in to the PATH. + +... #3 change 2084 edit on 2002/04/02 by mwm@guru (xtext) + + Fix the program name to be src, not external-src. + +... #2 change 2081 edit on 2002/04/02 by mwm@guru (xtext) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2080 branch on 2002/04/02 by mwm@guru (xtext) + + Stage one of the rename - fix the file names. + +... ... branch from //depot/mwm/webcheck/linbot.sh#1 +//depot/mwm/webcheck/plugins/__init__.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/badlinks.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/external.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". 
+ + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/images.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/notchkd.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/notitles.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/problems.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". 
+ + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/rptlib.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/sitemap.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/slow.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/whatsnew.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". 
+ + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/plugins/whatsold.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/schemes/__init__.py +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/schemes/filelink.py +... #3 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #2 change 2079 edit on 2002/04/02 by mwm@guru (text) + + Apply the patches from the FreeBSD port. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/schemes/ftplink.py +... 
#2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + +//depot/mwm/webcheck/schemes/httplink.py +... #5 change 2186 edit on 2002/05/04 by mwm@guru (text) + + Revert the accept headers - leave them out. + +... #4 change 2184 edit on 2002/05/04 by mwm@guru (text) + + Put back the accept headers. The problem appeared to be elsewhere. + +... #3 change 2086 edit on 2002/04/02 by mwm@guru (text) + + Rip out the accept: headers. They are making some servers choke for + some reason. + +... #2 change 2081 edit on 2002/04/02 by mwm@guru (text) + + Rename, phase 2 - change internal references from "linbot" to "webcheck". + + Also add my copyright, standardize the GNU copyright header, and rip out + the CVS cruft that we're not going to use. + + Document this in changes. + +... #1 change 2078 add on 2002/04/02 by mwm@guru (text) + + Check in linbot with the webcheck name, in preperation for the + rename for my distribution. + diff --git a/HISTORY.linbot b/HISTORY.linbot new file mode 100644 index 0000000..24ffc3a --- /dev/null +++ b/HISTORY.linbot @@ -0,0 +1,262 @@ +# $Log: debugio.py,v $ +# Revision 1.1 1999/03/11 02:29:50 marduk +# Added debugio module to handle debugging and IO + +# $Log: htmlparse.py,v $ +# Revision 1.5 1999/03/11 04:51:25 marduk +# Added version module. 
+# +# Revision 1.4 1999/03/11 02:29:50 marduk +# Added debugio module to handle debugging and IO +# +# Revision 1.3 1999/02/21 16:39:24 marduk +# 1.0b8 +# +# Revision 1.2 1999/01/10 01:01:44 marduk +# Linbot 1.0b6 +# +# Revision 1.1 1998/12/23 02:12:15 marduk +# This is 1.0b1 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 + + +# $Log: httpcodes.py,v $ +# Revision 1.2 1999/01/10 01:01:44 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 + +# $Log: __init__.py,v $ +# Revision 1.2 1999/01/10 01:02:02 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: badlinks.py,v $ +# Revision 1.2 1999/01/10 01:02:02 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: external.py,v $ +# Revision 1.2 1999/01/10 01:02:02 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: images.py,v $ +# Revision 1.3 1999/02/21 16:39:43 marduk +# 1.0b8 +# +# Revision 1.2 1999/01/10 01:02:03 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: notchkd.py,v $ +# Revision 1.2 1999/01/10 01:02:03 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: notitles.py,v $ +# Revision 1.2 1999/01/10 01:02:03 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: problems.py,v $ +# Revision 1.2 1999/01/10 01:02:03 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + + $Log: rptlib.py,v $ +# Revision 1.8 1999/03/12 04:56:05 marduk +# Added ability to warn of old versions of linbot/plugins +# Added patch to enable file:// to work with OS/2 +# +# Revision 1.7 1999/03/11 04:51:28 marduk +# Added version module. 
+# +# Revision 1.6 1999/03/11 02:30:00 marduk +# Added debugio module to handle debugging and IO +# +# Revision 1.5 1999/02/26 01:12:15 marduk +# -o option Created directory if does not exist. +# +# Revision 1.4 1999/02/21 16:39:44 marduk +# 1.0b8 +# +# Revision 1.3 1999/01/10 01:02:04 marduk +# Linbot 1.0b6 +# +# Revision 1.2 1998/12/31 03:49:08 marduk +# This is linbot 1.0b5. See CHANGES +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: sitemap.py,v $ +# Revision 1.2 1999/01/10 01:02:04 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: slow.py,v $ +# Revision 1.2 1999/01/10 01:02:04 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: whatsnew.py,v $ +# Revision 1.2 1999/01/10 01:02:05 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: whatsold.py,v $ +# Revision 1.2 1999/01/10 01:02:05 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Log: __init__.py,v $ +# Revision 1.3 1999/03/11 04:51:32 marduk +# Added version module. +# +# Revision 1.2 1999/01/10 01:02:15 marduk +# Linbot 1.0b6 +# +# Revision 1.1.1.1 1998/12/20 23:17:13 marduk +# initial 1.0 +# + +# $Log: filelink.py,v $ +# Revision 1.5 1999/03/12 04:56:07 marduk +# Added ability to warn of old versions of linbot/plugins +# Added patch to enable file:// to work with OS/2 +# +# Revision 1.4 1999/03/11 04:51:32 marduk +# Added version module. +# +# Revision 1.3 1999/01/10 01:02:15 marduk +# Linbot 1.0b6 +# +# Revision 1.2 1998/12/31 03:49:14 marduk +# This is linbot 1.0b5. See CHANGES +# +# Revision 1.1.1.1 1998/12/20 23:17:13 marduk +# initial 1.0 +# + +# $Log: ftplink.py,v $ +# Revision 1.6 1999/03/11 04:51:32 marduk +# Added version module. 
+# +# Revision 1.5 1999/03/11 02:30:05 marduk +# Added debugio module to handle debugging and IO +# +# Revision 1.4 1999/01/10 01:02:15 marduk +# Linbot 1.0b6 +# +# Revision 1.3 1998/12/31 03:49:14 marduk +# This is linbot 1.0b5. See CHANGES +# +# Revision 1.2 1998/12/23 07:38:35 marduk +# Fix bug: NameError: myUrlLib +# +# Revision 1.1.1.1 1998/12/20 23:17:14 marduk +# initial 1.0 +# + +# $Log: httplink.py,v $ +# Revision 1.11 1999/03/14 19:24:25 marduk +# Fixed bug when server redirects to a document in robots.txt +# +# Revision 1.10 1999/03/12 01:48:21 marduk +# Preliminary support for authenticating proxies added. +# Added Andrea's redirect-depth patch. +# +# Revision 1.9 1999/03/11 04:51:33 marduk +# Added version module. +# +# Revision 1.8 1999/03/11 02:30:05 marduk +# Added debugio module to handle debugging and IO +# +# Revision 1.7 1999/02/27 16:31:35 marduk +# Use putheader("User-Agent:"...) instead of putrequest(...) +# +# Revision 1.6 1999/02/26 01:55:08 marduk +# ACCEPTS all mime types in mimetypes.py +# +# Identify itself as Linbot x.x in HEAD requests +# +# Revision 1.5 1999/02/21 16:39:51 marduk +# 1.0b8 +# +# Revision 1.4 1999/01/10 21:58:19 marduk +# Changed self.* to link.* @line 86 in httplink.py +# +# Revision 1.3 1999/01/10 01:02:16 marduk +# Linbot 1.0b6 +# +# Revision 1.2 1998/12/31 03:49:14 marduk +# This is linbot 1.0b5. See CHANGES +# +# Revision 1.1.1.1 1998/12/20 23:17:12 marduk +# initial 1.0 +# + +# $Id: linbot.py,v 1.8 1999/03/12 04:56:01 marduk Exp $ +# Revision 1.8 1999/03/12 04:56:01 marduk +# Added ability to warn of old versions of linbot/plugins +# Added patch to enable file:// to work with OS/2 +# +# Revision 1.7 1999/03/12 01:48:14 marduk +# Preliminary support for authenticating proxies added. +# Added Andrea's redirect-depth patch. +# +# Revision 1.6 1999/03/11 04:51:25 marduk +# Added version module. 
+# +# Revision 1.5 1999/03/11 02:29:51 marduk +# Added debugio module to handle debugging and IO +# +# Revision 1.4 1999/02/21 16:39:25 marduk +# 1.0b8 +# +# Revision 1.3 1999/01/10 01:01:44 marduk +# Linbot 1.0b6 +# +# Revision 1.2 1998/12/23 07:34:59 marduk +# Fixed problem in linbot.py "import parser" +# +# Revision 1.1.1.2 1998/12/20 23:27:50 marduk +# This is pre 1.0, I hope @@ -0,0 +1,180 @@ + [Webcheck] + + + ------------------------------------------------------------------------ + +Installing Webcheck + +Installation is relatively easy. Note these installation instructions are +for Unix-like systems. Other operating systems may differ. + + 1. Unpack the gzipped tarchive. Be sure to add the directory to your + PYTHONPATH environment variable. + + $ tar zxvf webcheck-1.0b6.tar.gz -C /usr/local/lib + $ PYTHONPATH="/usr/local/lib/webcheck:$PYTHONPATH" + $ export PYTHONPATH + + 2. Add a symbolic link to some place in your PATH + + $ ln -s /usr/local/lib/webcheck/webcheck.py /usr/local/bin/webcheck + + 3. Edit the config.py file to your choosing. Most of the defaults are + safe. The important ones can be overridden with command-line flags. You + may want to keep a copy of the original config.py file just in case. + The config.py options are documented within the file. + + ------------------------------------------------------------------------ + +Running Webcheck + +It is simple to run Webcheck. + +Executing Webcheck without any command-line arguments will cause it to give a +simple synopsis of its usage and then quit. + +$ webcheck +webcheck [-abvq][-l url][-x url]... [-y url]... [-r depth][-o dir][-w sec][-d level] url [location]... + +Before running Webcheck on a site, you need to do a little preparation. + +One thing that Webcheck needs is a directory in which to publish its reports. +It is recommended that you choose a directory which is empty and will only +contain webcheck reports.
This directory must exist and be writeable by the +user running webcheck before webcheck is run. + +$ mkdir /usr/local/apache/share/htdocs/webcheck + +The report can be viewed using most web browsers. Browsers using frames can +initially open the "index.html" file. Browsers not supporting frames or +users who do not like frames can initially open the "navbar.html" file. Note +these are default filenames for Webcheck and may be changed via the config +file. + +It should be decided beforehand which documents on your site should be +considered "internal" and which should be considered "external". Webcheck +defines internal and external documents as such: + +An internal document is a part of your site that you have control of and +checked, as well as the links that it points to. Basically an internal +document is one that, if broken, you have the power to fix. + +An external document is one that an internal document points to but you have +no jurisdiction over. It can also be a document that you have the power to +change, but need not be checked, such as documents pointed to by CGI scripts +or other automated tools such as Webcheck. + +Your base url is the url pointing to the document that is the top level of +your site. Commonly referred to as the "home page", it is the url that +points to all other urls, either directly or indirectly. The base url can be +on one web server but point to documents on another server that hosts other +internal documents. An example would be a main server +www.someplaceonthenet.com in which there may be links to an alternate server +called www2.someplaceonthenet.com. In this case www2.someplaceonthenet.com +would host internal documents even though your "home page" is on +www.someplaceonthenet.com. + +That said, you should have a basic idea of what you do and do not want +Webcheck to check. Don't be surprised if you do not get it exactly right the +first time. 
Also, consider using the robots.txt explained at +http://info.webcrawler.com/mak/projects/robots/exclusion-admin.html. +Currently Webcheck identifies itself as User-Agent: Webcheck. + +You can allow Webcheck to search a directory but restrict other bots, for +example, like this: + +User-agent: * +Disallow: / + +User-agent: Webcheck +Allow: / + +Okay you have heard enough and you just want to run the darn thing. The +simplest way to run Webcheck is: + +$ webcheck http://www.someplaceonthenet.com/ + +This will first read the robots.txt file at www.someplaceonthenet.com and +then proceed to examine every link pointed to on that site except documents +denied by robots.txt if that file exists. + +The exact usage for webcheck is given below. + + ------------------------------------------------------------------------ + +Synopsis + + webcheck webcheck [-abvq][-l url][-x url]... [-y url]... [-r depth][-o dir][-w + sec][-d level] url [location[:port]]... + + + + -x regexUse this option to tell Webcheck to consider any url matching regex + to be external. Uses perl-type regular expressions. Can be used + multiple times. + -y regexLike the -x flag, though this option will cause Webcheck to not + check the link matched by regex whereas -x will check the link but + not its children. Uses perl-type regular expressions. Can be used + multiple times. + -l url Use url for the logo image on all reports. The url should point to + a valid image. + -b Base urls only. Tells Webcheck to consider any url that does not + start with the base url to be considered external. For example, if + you run webcheck -b + http://www.someplaceonthenet.com/~somebody/foo.html then + http://www.someplaceonthenet.com/~somebody/misc/index.html will be + considered internal whereas http://www.someplaceonthenet.com/ will + be considered external. + -a Avoid external links. 
Normally if Webcheck is examining an HTML page + and it finds a link that points to an external document, it will + check to see if that external document exists. This flag disables + that action. External links will not be checked. + -q Quiet. Do not print out the progress as Webcheck traverses a site + (equivalent to -d 0). + -o dir Output directory. Use to specify the directory where Webcheck will + dump its reports. The default is the current directory or as + specified by config.py. If this directory does not exist it will + be created for you (if possible). + -r depthRedirect depth. the amount of redirects Webcheck should follow when + following a link. 0 implies follow all redirects. + -w secs Wait secs between link checking. Usually Webcheck will process a url + and immediately move on to the next. However on some loaded + systems it may be desirable to have Webcheck pause between requests. + This option can be set to any non-negative number. + -d levelSet debug level to level. For programmer-level debugging use a + level > 1. + url The base url. Webcheck checks this link first, then all the links it + points to on down the "tree". + location This specifies the hosts pointed to that are to be considered + internal. By default Webcheck only considers URLs pointing to the + host of the base url to be internal. However if your site resides + on multiple servers use this parameter to tell Webcheck what other + servers should be considered internal. May be used multiple times, + but must follow url. + ------------------------------------------------------------------------ + +Examples + +Here are some examples of running Webcheck. 
+ +$ webcheck http://manson.ddns.org/ -x /webcheck starship.skyport.net +$ webcheck -o /stats/altavista/ http://altavista.digital.com/ +$ webcheck -o ~/Lang/Python/webcheck -b -l http://manson.ddns.org/images/marduk.gif http://manson.ddns.org/~marduk/ + ------------------------------------------------------------------------ + +Running Periodically + +Webcheck may be safely run periodically or on off-peak hours using cron or at. +It may be safely run unattended. You may want to redirect Webcheck's output to +the null device, log file, or have it emailed to an account. Consult your +operating system manuals for how this can be done on your system. + + ------------------------------------------------------------------------ + +Feedback + +If you have any questions about Webcheck or would like to report a +bug, it helps a lot to include a url where the problem can be found, +an HTML file where the error occurs or a (small) tar of the site where +the error occurs. Suggestions for improvements are also welcomed. +Patches and code contributions are even better. @@ -0,0 +1,41 @@ + +Webcheck is the amazing Site Management Tool for webmasters. Downloads and more +information at: + + http://www.mired.org/webcheck/ + +Webcheck allows webmasters to: + +* View The Structure Of A Site + +* Track Down Broken Links + +* Find Potentially Outdated Web Pages + +* List Links Pointing To External Sites + +* View Portfolio Of Inline Images + +* Do All This Periodically And Without User Intervention + + + +Changes to v. 1.0 include: + +* Faster checking of sites (only downloads files when it needs). + +* Supported schemes (http, ftp, file) handled more efficiently. + +* More modular design allows other schemes to be added easily. + +* Plug-in support: third-party reports can be added to webcheck easily! + +* Themes (TM) support via Cascading StyleSheets. + +* Lots of bug fixes, including the infamous proxy bug. + +* and more!
+ +Webcheck is a FREE clone of Linkbot <URL: +http://www.watchfire.com/solutions/linkbot.asp > and incorporates many +of Linkbot's features as well as enhancements of its own. @@ -0,0 +1,24 @@ + + +************************************************************************** +* I'm running out of ideas ;-). If you have any suggestions for * +* improvement, please let me know. mwm@mired.org * +************************************************************************** + +Support for authenticating proxies + +New config file format. + +Configurable time-out when retrieving a document. + +Cookies support (maybe) + +Integration with weblint + +If using 'file' scheme, clicking on a link will bring up the file in an editor + +Support for multi-threading (maybe) + +Option to put # of hits of a document in the Site Map obtained from log file. + +Export to database file. diff --git a/config.py b/config.py new file mode 100644 index 0000000..54a5746 --- /dev/null +++ b/config.py @@ -0,0 +1,157 @@ +""" + + Webcheck Configuration file + Edit this file to your choosing. This is just a regular Python module, so + if you want to do something fancy with it, go right ahead. Just make sure + that all variables are defined and have an appropriate value. + +""" + + +# if this is true, webcheck will consider external all links that do not start in +# the same directory level as the base url. For example, given +# webcheck http://www.myhost.com/~me/ +# 'http://www.myhost.com/~me/stuff/index.html' would be considered internal while +# 'http://www.myhost.com/index.html' would be considered external. +# The default is false (0). Note this can be overridden with the -b command-line +# flag +BASE_URLS_ONLY=0 + +# This is a (Python) list of URLs that should not be explored. This can also +# be passed to webcheck via the -x command line switch. Note this should be a +# VALID REGULAR EXPRESSION. See also YANKED_URLS below.
+EXCLUDED_URLS = [r'.*\.gif',r'.*\.tar\.gz',r'.*\.jpeg',r'.*\.jpg', + r'http://www.mired.org/cgi-bin/', r'http://www.mired.org/ATCPFAQ/'] + +# This is like EXCLUDED_URLS, but YANKED_URLS are not checked at all. Also +# you can use the -y command line switch. +# When using the below parameter, make sure that the regular expressions are +# raw Python strings (beginning quote preceded with an "r"). Regular expressions +# are case insensitive. +YANKED_URLS = [r'http://www.amazon.com/exec/obidos/', + r'http://www.mired.org/home/mwm/&me;.txt'] + +# Normally webcheck will check links to "external" sites at the top level to +# ensure that your pages don't refer to broken links that are not at your +# site. However, you may not want this. Setting this option to 1 will cause +# webcheck to not check external links. This can also be set with the +# command-line -a switch. +# +# Note: a link that is part of the EXCLUDED_URLS list is considered external. +AVOID_EXTERNAL_LINKS = 0 + +# Currently, Webcheck can check http:, ftp:, and file: schemes. However, you may +# want to avoid certain schemes (such as file: or ftp:). Remove the scheme +# from this list and Webcheck will avoid it. Avoided URLs are treated as external. +# Default is to not avoid any. +# Examples: +#SCHEMES = ['http'] +#SCHEMES = ['http','ftp','file'] +SCHEMES = ['http','ftp'] + + + +# You can define proxies for the individual schemes above.
The PROXIES config +# variable is a python dictionary or 'None', for example: +# PROXIES = {'http':'http://localhost:3128'} +PROXIES = None +# Note: according to the urllib documentation, you should also be able to set +# proxies according to your system's environment variables, for example: +# $ HTTP_PROXY='http://localhost:3128' ; export HTTP_PROXY # using Bourne Shell +# $ FTP_PROXY='http://localhost:3128' ; export FTP_PROXY +# proxies in the configuration take precedence over environment settings + + +# hostnames (for example, www.myhost.com) which are to be considered local to +# your site. Note that by default, the base URL of your site is considered +# local. This can also be passed via command-line (see documentation for details). +HOSTS = ['www.mired.org','mired.org'] + + +# Directory where files generated by webcheck will be placed. This can also be +# specified via the -o command-line flag. +OUTPUT_DIR = '.' + +# When listing a broken link in its published report, Webcheck can either make the +# broken link 'active' or simply list the URL. Most users will probably not +# want the broken link to be active. +ANCHOR_BAD_LINKS = 1 + +# Usually, Webcheck will process a URL and immediately move on to the next one. +# However, on some loaded systems, it may be more desirable to have Webcheck wait +# a while between requests. This option should be set to any non-negative number +# (in seconds). This can also be set using the command-line -w <secs> flag. +WAIT_BETWEEN_REQUESTS = 0 + +# When Webcheck encounters a 301 or 302 response from the server, it +# needs to decide how many times it will follow the indications of the +# server. By setting this option, you may change it to your +# tastes. Setting it to -1 means "infinite redirection" (don't say I +# didn't warn you, when your sysadm tries to make you eat the 10^6 +# network logs you produced and he printed...
:) +REDIRECT_DEPTH = 5 + +# Webcheck has the option of checking a registry and determine it is the +# latest version of Webcheck as well as plugin reports you are using. If +# this option is set to true (not 0) it will check the registry and print a +# message on the reports to notify you along with a link to where you +# can download the latest version of the plugin (or Webcheck). Note that +# this feature requires that Webcheck have access to the Internet +# +# **** THIS FEATURE IS CURRENTLY NONFUNCTIONAL **** +WARN_OLD_VERSION = 0 + +# Debug level. For normal output, set to 1. The higher the number, the more +# output. A setting of 0 produces no output. +DEBUG_LEVEL = 1 + +################ The section below is for report plugins ################ + +# This is the list of report plugins to display. The elements are strings and +# there should be a corresponding .py file in the WEBCHECKHOME/reports directory +# else bad things will occur ;-). Place in the order for which you would like to +# see them in the navigation bar. +# Note: Do not include the 'problems' report as it will appear (last) on all +# reports automatically +PLUGINS = ['sitemap', + 'badlinks', + 'images', + 'whatsold', + 'whatsnew', + 'slow', + 'notitles', + 'external', + 'notchkd'] + +# This is a URL (absolute or relative) of a level 1 Cascading Stylesheet to be +# used in all reports. See the default webcheck.css as well as the HTML source +# for ideas on making your own .css for Webcheck. +STYLESHEET = '' + +##### The Navigation (menu) frame/page ############ +NAVBAR_FILENAME = 'navbar.html' +NAVBAR_WIDTH = '150' +NAVBAR_PADDING = 4 +NAVBAR_SPACING = 0 + +MAIN_FILENAME = 'index.html' + +# url to logo (image) shown on all pages. 
If you change this you will also +# want to change the LOGO_ALT option below +LOGO_HREF="http://www.mired.org/webcheck/webcheck.gif" + +##### Configuratin for specific plugins ##### +REPORT_SITEMAP_LEVEL = 5 # How many levels deep to display the site map + +# number of columns in thumbnail image page +REPORT_IMAGES_COLS=5 +# width of thumbnail images +REPORT_IMAGES_WIDTH=100 +# height of thumbnail images +REPORT_IMAGES_HEIGHT=100 + +REPORT_WHATSOLD_URL_AGE = 700 +REPORT_WHATSNEW_URL_AGE = 7 + +REPORT_SLOW_URL_SIZE = 76 + diff --git a/contrib/plugins/about.py b/contrib/plugins/about.py new file mode 100644 index 0000000..470d02e --- /dev/null +++ b/contrib/plugins/about.py @@ -0,0 +1,47 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +"""Plugins used in this report""" + +# This is a trivial plugin aid developers of linbot pluggins + +__version__ = '1.0' +__author__ = 'mwm@mired.org' + +import webcheck +from httpcodes import HTTP_STATUS_CODES +from rptlib import * + +Link = webcheck.Link +linkList = Link.linkList +config = webcheck.config + +title = "About Plugins" + +def generate(): + print '<div class="table">' + print '<table border=0 cellpadding=2 cellspacing=2 width="75%">' + print '<tr><th>Plugin</th><th>Version</th><th>Author</th></tr>' + for plugin in config.PLUGINS + ['problems']: + report = __import__('plugins.%s' % plugin,globals(),locals(),[plugin]) + author = report.__author__ + version = report.__version__ + print '<tr><td class="pluginname">%s</td>' % plugin, + print '<td class="pluginversion">%s</td>' % version, + print '<td class="pluginauthor">%s</td></tr>' % author + print '</table>' + print '</div>' diff --git a/debugio.py b/debugio.py new file mode 100644 index 0000000..3ef3d76 --- /dev/null +++ b/debugio.py @@ -0,0 +1,34 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + +"""debugio.py: debugging and input/output module + + This module contains facilities for printing to standard output. 
# Usage: import this module, set DEBUG_LEVEL, and call write() whenever
# something should be printed.
import sys

DEBUG_LEVEL=1

def write(s, level=1, file=sys.stdout):
    """Write s and a newline to file unless level exceeds DEBUG_LEVEL.

    s     -- the message; it is formatted with '%s'
    level -- the lowest DEBUG_LEVEL at which the message is shown
    file  -- object with a write() method; the default is the sys.stdout
             that was in place when this module was imported
    """
    if level > DEBUG_LEVEL:
        # message is more detailed than the current debug level allows
        return
    file.write("%s\n" % s)
"""Utilities for parsing HTML and urls"""

import htmllib
import string
import debugio

def urlformat(url,parent=None):
    """Return url in expanded form with its fragment identifier removed.

    url    -- the url to format; may be relative
    parent -- url that a url without a scheme is joined onto; when it is
              None such a url is left as it is

    The query string is kept.
    """

    from urlparse import urlparse, urljoin, urlunparse

    scheme = urlparse(url)[0]
    if scheme == '' and parent is not None:
        url = urljoin(parent,url)
    parts = list(urlparse(url))
    parts[5] = '' # remove fragment
    # parts[4] = '' # remove query string
    return urlunparse(tuple(parts))


class MyHTMLParser(htmllib.HTMLParser):
    """HTML parser that collects the links, images and metadata of a page.

    Besides the anchorlist attribute inherited from htmllib.HTMLParser
    (extended here with <frame src> and <area href> targets) the parser
    provides:

    imagelist -- image sources, each listed once
    title     -- text of <title> with runs of whitespace collapsed, or None
    author    -- content of <meta name="author">, or None
    base      -- href of <base>, or None
    """

    def __init__(self,formatter):
        self.imagelist = []
        self.title = None
        self.author = None
        self.base = None
        htmllib.HTMLParser.__init__(self,formatter)

    # override handle_image()
    def handle_image(self,src,alt,*stuff):
        if src not in self.imagelist: self.imagelist.append(src)

    def do_frame(self,attrs):
        for name, val in attrs:
            if name=="src":
                # skip duplicates, the same way do_area() does
                if val not in self.anchorlist:
                    self.anchorlist.append(val)

    def save_bgn(self):
        self.savedata = ''

    def save_end(self):
        data = self.savedata
        self.savedata = None
        return data

    def start_title(self, attrs):
        self.save_bgn()

    def end_title(self):
        text = self.save_end()
        if text is None:
            # </title> without a matching <title>: nothing was saved, so
            # keep whatever title we already have
            return
        # an empty or all-whitespace title counts as no title at all
        self.title = string.join(string.split(text)) or None

    def do_meta(self,attrs):
        fields={}
        for name, value in attrs:
            fields[name]=value
        if not fields.has_key('name') or not fields.has_key('content'):
            return
        if string.lower(fields['name']) != 'author':
            return
        self.author = fields['content']
        debugio.write('\tauthor: ' + self.author)

    # <AREA> for client-side image maps
    def do_area(self,attrs):
        for name, val in attrs:
            if name=="href":
                if val not in self.anchorlist:
                    self.anchorlist.append(val)

    def do_base(self,attrs):
        for name,val in attrs:
            if name=="href":
                self.base = val


def _unique_urls(urls,parent):
    """Format every url against parent and drop duplicates, keeping order."""
    seen = {}
    unique = []
    for item in urls:
        item = urlformat(item,parent)
        if not seen.has_key(item):
            seen[item] = 1
            unique.append(item)
    return unique


def pageLinks(url,page):
    """Return (urllist, imagelist, title, author) for an HTML page.

    url  -- url of the page; partial urls found in the page are expanded
            against it unless the page contains a <BASE HREF=> tag
    page -- the HTML, handed unchanged to the parser's feed() method
            NOTE(review): the original comment called this a file object,
            but feed() is normally given text -- confirm against the
            scheme modules that produce it

    urllist and imagelist hold formatted urls (see urlformat()), each
    listed once in the order of first appearance.  title and author are
    None when the page does not provide them.
    """
    from formatter import NullFormatter

    parser = MyHTMLParser(NullFormatter())
    parser.feed(page)
    parser.close()

    if parser.base is not None:
        parent = parser.base
    else:
        parent = url

    urllist = _unique_urls(parser.anchorlist,parent)
    imagelist = _unique_urls(parser.imagelist,parent)
    return (urllist, imagelist, parser.title, parser.author)
__version__='0.10'
__author__ = 'Mike Meyer <mwm@mired.org>'

# Reason phrases for HTTP status codes.  The keys are the status codes as
# three-character strings, so a status must be converted with str() before
# it is looked up.
HTTP_STATUS_CODES= {'100':"Continue",
                    '101':"Switching Protocols",
                    '200':"OK",
                    '201':"Created",
                    '202':"Accepted",
                    '203':"Non-Authoritative Information",
                    '204':"No Content",
                    '205':"Reset Content",
                    '206':"Partial Content",
                    '300':"Multiple Choices",
                    '301':"Moved Permanently",
                    '302':"Moved Temporarily",
                    '303':"See Other",
                    '304':"Not Modified",
                    '305':"Use Proxy",
                    '307':"Temporary Redirect",
                    '400':"Bad Request",
                    '401':"Unauthorized",
                    '402':"Payment Required",
                    '403':"Forbidden",
                    '404':"Not Found",
                    '405':"Method Not Allowed",
                    '406':"Not Acceptable",
                    '407':"Proxy Authentication Required",
                    '408':"Request Time-out",
                    '409':"Conflict",
                    '410':"Gone",
                    '411':"Length Required",
                    '412':"Precondition Failed",
                    '413':"Request Entity Too Large",
                    '414':"Request-URI Too Large",
                    '415':"Unsupported Media Type",
                    '416':"Requested range not satisfiable",
                    '417':"Expectation Failed",
                    '500':"Internal Server Error",
                    '501':"Not Implemented",
                    '502':"Bad Gateway",
                    '503':"Service Unavailable",
                    '504':"Gateway Time-out",
                    '505':"HTTP Version not supported"
                    }
+ +"""Generic library for handling urls and links""" + +config = None +robot_parsers={} +SECS_PER_DAY=60*60*24 +compiled_ex = [] +compiled_yanked = [] +linkmodules={} + +from urllib import * +from types import * +import htmllib +import httplib +import robotparser +import string +# The following is to help sgmllib parse DOS/Windows text files +string.whitespace = string.whitespace + '\012\015' +import time +import re +import stat +import htmlparse +import debugio +import sys +import socket + + +def get_robots(location): + global robot_parsers + debugio.write('\tGetting robots.txt for %s' % location) + rp=robotparser.RobotFileParser(config.PROXIES) + try: + rp.set_url('http://' + location + '/robots.txt') + rp.read() + robot_parsers[location]=rp + except TypeError: + pass + +def can_fetch(location, url): + """Return true if url is allowed at location, else return 0""" + if robot_parsers.has_key(location): + return robot_parsers[location].can_fetch('Webcheck',url) + return 1 + +############################################################################ +class Link: + """ my class of url's which includes parents, HTTP status number, and + a list of URL's in that link urls. + """ + + linkList = {} + badLinks = [] + notChecked = [] + images = {} + baseurl = "" + base="" + + # This is a static variable to indicate if the config.EXCLUDED urls have been + # compiled as regular expressions. 
+ re_compiled = 0 + + def __init__(self,url,parent): + self.init() + + debugio.write('\tparent = ' + str(parent),2) + from urlparse import urlparse + + parsed = urlparse(url) + self.scheme = parsed[0] + location = parsed[1] + + if parent not in self.parents: + if parent: self.parents.append(parent) + + self.URL = url + Link.linkList[self.URL]=self + + modname = self.scheme + 'link' + if linkmodules.has_key(modname): linkmodule = linkmodules[modname] + else: + try: + linkmodule = linkmodules[modname] = __import__('schemes.'+modname, globals(),locals(),[modname]) + except ImportError: + self.status="Not Checked" + self.external=1 + self.URL=url + Link.notChecked.append(self.URL) + Link.linkList[self.URL]=self + debugio.write('\tNot checked: URL scheme ' + self.scheme + ' ignored.') + return + + if (parent is None): + Link.baseurl=self.URL + if hasattr(self.URL, 'rfind'): + Link.base=self.URL[:self.URL.rfind('/')+1] + else: + Link.base=self.URL[:string.rfind(self.URL,'/')+1] + if Link.base[-2:] == '//': Link.base = self.URL + debugio.write('\tbase: %s' % Link.base) + if self.scheme == 'http': + base_location = parsed[1] + if base_location not in config.HOSTS: + config.HOSTS.append(base_location) + if not robot_parsers.has_key(location): + try: + get_robots(location) + except IOError: + pass + + # see if robots.txt will let us in + if self.scheme == 'http': + if not can_fetch(location, url): + debugio.write('\tRobot Restriced') + self.status = 'Not Checked' + self.message = 'Robot Restricted' + Link.notChecked.append(url) + return + + try: + linkmodule.init(self, url, parent) + if (self.URL not in Link.badLinks) and (self.type == 'text/html'): + page = linkmodule.get_document(self.URL) + self._handleHTML(self.URL, page) + except IOError, data: + self.set_bad_link(url,str(data.errno) + ': ' + str(data.strerror)) + return + except socket.error, data: + if type(data) is StringType: + self.set_bad_link(url, data) + elif type(data) is TupleType: + errno, string = data + 
self.set_bad_link(url,str(errno) + ': ' + string) + else: + self.set_bad_link(url,str(data)) + except KeyboardInterrupt: + raise KeyboardInterrupt + except: + self.set_bad_link(url,"Error: Malformed URL?") + debugio.write("\t%s: %s" % (sys.exc_type, sys.exc_value),3) + return + + def explore_children(self): + for child in self.children: + if not Link.linkList.has_key(child): + if config.WAIT_BETWEEN_REQUESTS > 0: + debugio.write('sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS) + time.sleep(config.WAIT_BETWEEN_REQUESTS) + debugio.write("adding url: %s" % child) + if is_yanked(child): + Link.linkList[child]=ExternalLink(child,self.URL,1) + elif is_external(child) or is_excluded(child): + Link.linkList[child]=ExternalLink(child,self.URL) + else: + Link.linkList[child]=Link(child,self.URL) + elif self.URL not in Link.linkList[child].parents: + Link.linkList[child].parents.append(self.URL) + return # __init__ + + def init(self): + """ initialize some variables """ + self.age = None + self.scheme = None + self.headers = None + self.parents= [] + self.children = [] + self.status = None + self.title = None + self.external = 0 + self.html = 0 + self.size = 0 + self.totalSize = 0 + self.author = None + + def __repr__(self): + return self.URL + + def set_bad_link(self,url,status): + """ flags the link as bad """ + debugio.write('\t' + str(status)) + self.status = str(status) + self.URL=url + Link.linkList[self.URL]=self + Link.badLinks.append(self.URL) + + def _handleHTML(self,url,htmlfile): + """examines and html file and updates the Link object""" + # get anchorlist + (anchorlist, imagelist, title, author) = htmlparse.pageLinks(url,htmlfile) + + debugio.write('\ttitle: %s' % str(title)) + for child in anchorlist: + if child not in self.children: + self.children.append(child) + + self.totalSize = self.size + self.title = title + self.author = author + self.html = 1 + # get image list + for image in imagelist: + if image not in Link.images.keys(): + 
debugio.write('\tadding image: %s' % image) + Link.images[image] = Image(image, self.URL) + self.totalSize = self.totalSize + int(Link.images[image].size) + if not self.external: self.explore_children() + return + + + +class ExternalLink(Link): + """ this class is just like Link, but it does not explore it's children """ + + def __init__(self,url,parent,yanked=0): + + if config.AVOID_EXTERNAL_LINKS or yanked: + self.init() + self.status="Not Checked" + self.external=1 + debugio.write('\tNot checked') + if yanked: debugio.write('\tYanked') + if parent not in self.parents: + if parent: self.parents.append(parent) + Link.notChecked.append(url) + return + Link.__init__(self,url,parent) + self.external=1 + + + def _handleHTML(self,url,htmlfile): + # ignore links and images, but use the title + self.title = htmlparse.pageLinks(url,htmlfile)[2] + debugio.write('\ttitle: %s' % str(self.title)) + self.children=[] + +class Image(Link): + """ This class is just like link, but different :-)""" + def __init__(self, url, parent): + #self.init() + Link.__init__(self, url, parent) + #self.age = getAge(self) + + def _handleHTML(self,url,htmlfile): + """Don't handle HTML, this is an image""" + self.set_bad_link(url,"HTML file used in IMG tag?") + return + +def is_external(url): + """ returns true if url is an external link """ + from urlparse import urlparse + parsed = urlparse(url) + scheme = parsed[0] + location = parsed[1] + if (location not in config.HOSTS) and (scheme in ['http','ftp']): + return 1 + if config.BASE_URLS_ONLY and (Link.base!=url[:len(Link.base)]): + return 1 + return 0 + +def compile_re(): + """Compile EXCLUDED URLSs and set flag""" + global compiled_ex + for i in config.EXCLUDED_URLS: + debugio.write('compiling %s' % i,3) + compiled_ex.append(re.compile(i,re.IGNORECASE)) + for i in config.YANKED_URLS: + debugio.write('compiling %s' % i,3) + compiled_yanked.append(re.compile(i,re.IGNORECASE)) + Link.re_compiled = 1 + +def is_excluded(url): + """ Returns true if 
url is part of the EXCLUDED_URLS list """ + if not Link.re_compiled: + compile_re() + for x in compiled_ex: + if x.search(url) is not None: + return 1 + return 0 + +def is_yanked(url): + """ Returns true if url is part of YANKED_URLS list""" + if not Link.re_compiled: + compile_re() + for x in compiled_yanked: + if x.search(url) is not None: + return 1 + return 0 + diff --git a/plugins/__init__.py b/plugins/__init__.py new file mode 100644 index 0000000..2a586eb --- /dev/null +++ b/plugins/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + diff --git a/plugins/badlinks.py b/plugins/badlinks.py new file mode 100644 index 0000000..ef1f229 --- /dev/null +++ b/plugins/badlinks.py @@ -0,0 +1,56 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +"""Listing of bad links""" + +__version__ = '1.0' +__author__ = 'mwm@mired.org' + +import webcheck +from httpcodes import HTTP_STATUS_CODES +from rptlib import * + +Link = webcheck.Link +linkList = Link.linkList +config = webcheck.config + +title = 'Bad Links' + +def generate(): + print '<div class="table">' + print '<table border=0 cellspacing=2 width="75%">' + for link in Link.badLinks: + print '\t<tr><td class="blank" colspan=3> </td></tr>' + if config.ANCHOR_BAD_LINKS: + print '\t<tr class="link"><th>Link</th>', + print '<td colspan=2 align=left>' +make_link(link,link) +'</td></tr>' + else: + print '\t<tr class="link"><th>Link</th>', + print '<td colspan=2 align=left>%s</td></tr>' % link + status = str(linkList[link].status) + if status in HTTP_STATUS_CODES.keys(): + status = status + ": " + HTTP_STATUS_CODES[status] + print '\t<tr class="status"><th>Status</th><td colspan=2>%s</td></tr>' % status + print '\t<tr class="parent"><th rowspan="%s">Parents</th>' % len(linkList[link].parents) + parents = linkList[link].parents + parents.sort(sort_by_author) + for parent in parents: + print '\t\t<td>%s</td>' % make_link(parent,get_title(parent)), + print '<td>%s</td>\n\t</tr>' % (str(linkList[parent].author)) + add_problem("Bad Link: " + link,linkList[parent]) + print '</table>' + print '</div>' diff --git a/plugins/external.py b/plugins/external.py new file mode 100644 index 0000000..44e11f3 --- /dev/null +++ b/plugins/external.py @@ -0,0 +1,40 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike 
Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +"""External links""" + +__version__ = '1.0' +__author__ = 'mwm@mired.org' + + +import webcheck +from httpcodes import HTTP_STATUS_CODES +from rptlib import * + +Link = webcheck.Link +linkList = Link.linkList +config = webcheck.config + +title = 'External Links' + +def generate(): + print '<ol>' + for url in linkList.keys(): + link=linkList[url] + if link.external: + print '\t<li>%s' % make_link(url,get_title(url)) + print '</ol>' diff --git a/plugins/images.py b/plugins/images.py new file mode 100644 index 0000000..7d65d0b --- /dev/null +++ b/plugins/images.py @@ -0,0 +1,58 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +"""Image Catalog""" + +__version__ = '1.0' +__author__ = 'mwm@mired.org' + +import webcheck +from httpcodes import HTTP_STATUS_CODES +from rptlib import * + +Link = webcheck.Link +linkList = Link.linkList +config = webcheck.config + +title = 'Images' + +# images +def generate(): + import math + imagelist=Link.images.keys() + + currentPic=0 + rows = int(math.ceil(len(imagelist)/config.REPORT_IMAGES_COLS))+1 + print '<div class="table">' + print '<table border=0 cellspacing="1" cellpadding="0">' + + for row in range(rows): + print'\t<tr>' + for col in range(config.REPORT_IMAGES_COLS): + if currentPic==len(imagelist): break + image=imagelist[currentPic] + print '\t\t<td>' + \ + make_link(image, + '<img src="%s" width="%d" height="%d" alt="%s">' \ + % (image,config.REPORT_IMAGES_WIDTH, + config.REPORT_IMAGES_HEIGHT, image)), + print '</td>' + currentPic = currentPic + 1 + + print '\t</tr>' + print '</table>' + print '</div>' diff --git a/plugins/notchkd.py b/plugins/notchkd.py new file mode 100644 index 0000000..d9c08d0 --- /dev/null +++ b/plugins/notchkd.py @@ -0,0 +1,46 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +"""Pages which were not checked""" + +__version__ = '1.0' +__author__ = 'mwm@mired.org' + +import webcheck +from httpcodes import HTTP_STATUS_CODES +from rptlib import * + +Link = webcheck.Link +linkList = Link.linkList +config = webcheck.config + +title = 'Not Checked' + +def generate(): + print '<div class="table">' + print '<table border=0 cellpadding=2 cellspacing=2 width="75%">' + for url in Link.notChecked: + print '\t<tr><th colspan=4>%s</th></tr>' % make_link(url,url) + print '\t<tr class="parent"><th rowspan="%s">Parent</th>' % len(linkList[url].parents) + for parent in linkList[url].parents: + print '\t\t', + if parent != linkList[url].parents[0]: print '<tr>', + print '<td colspan=2>%s</td>' % make_link(parent,get_title(parent)), + print '<td>%s</td></tr>' % (linkList[parent].author) + print '\n\t<tr><td class="blank" colspan=4> </td></tr>\n' + print '</table>' + print '</div>' diff --git a/plugins/notitles.py b/plugins/notitles.py new file mode 100644 index 0000000..aba829a --- /dev/null +++ b/plugins/notitles.py @@ -0,0 +1,47 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +"""Pages with no titles""" + +__version__ = '1.0' +__author__ = 'mwm@mired.org' + +import webcheck +from httpcodes import HTTP_STATUS_CODES +from rptlib import * + +Link = webcheck.Link +linkList = Link.linkList +config = webcheck.config + +title = 'No Titles' + +def generate(): + print '<div class="table">' + print '<table border=0 cellpadding=2 cellspacing=2 width="75%">' + print '\t<tr><th>URL</th><th>Author</th></tr>' + urls = linkList.keys() + urls.sort(sort_by_author) + for url in urls: + link = linkList[url] + if link.external: continue + if link.html and (link.title is None): + print '\t<tr><td>%s</td><td>%s</td></tr>' \ + % (make_link(url,url), link.author) + add_problem("No Title",link) + print '</table>' + print '</div>' diff --git a/plugins/problems.py b/plugins/problems.py new file mode 100644 index 0000000..2a42f99 --- /dev/null +++ b/plugins/problems.py @@ -0,0 +1,53 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +"""Breakdown of links with problems""" + +__version__ = '1.0' +__author__ = 'mwm@mired.org' + +import webcheck +from httpcodes import HTTP_STATUS_CODES +from rptlib import * + +Link = webcheck.Link +linkList = Link.linkList +config = webcheck.config + +title = 'Problems (By Author)' + +def generate(): + authors=problem_db.keys() + authors.sort() + if len(authors) > 1: + print '<p class="authorlist">' + for author in authors[:-1]: + print '<a href="#%s">%s</a>' % (author, author), + print " | " + print '<a href="#%s">%s</a>' % (authors[-1], authors[-1]), + print '</p>' + print '<div class="table">' + print '<table border=0 cellpadding=2 cellspacing=2 width="75%">' + for author in authors: + print '<tr><th><a name="%s">%s</a></th></tr>' % (author,author) + for type,link in problem_db[author]: + url=`link` + title=get_title(url) + print '<tr><td>%s <br>%s</td></tr>' % (make_link(url,title), type) + print '<tr><td class="blank"> </td></tr>\n' + print '</table>' + print '</div>' diff --git a/plugins/rptlib.py b/plugins/rptlib.py new file mode 100644 index 0000000..101a56e --- /dev/null +++ b/plugins/rptlib.py @@ -0,0 +1,290 @@ +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+
+__version__ = '1.0'
+__author__ = 'mwm@mired.org'
+
+import sys
+import webcheck
+import urllib
+import string
+import os
+import debugio
+import version
+
+Link = webcheck.Link
+linkList = Link.linkList
+config = webcheck.config
+proxies = config.PROXIES
+
+# author -> list of (type, link) tuples; filled in by add_problem()
+problem_db = {}
+
+# get the stylesheet for insertion,
+# Note that I do it this way for two reasons. One is that Netscape reportedly
+# handles stylesheets better when they are inlined. Two is that people often
+# forget to put webcheck.css in the output directory.
+if proxies is None:
+    proxies = urllib.getproxies()
+opener = urllib.FancyURLopener(proxies)
+opener.addheaders = [('User-agent','Webcheck ' + version.webcheck)]
+try:
+    stylesheet = opener.open(config.STYLESHEET).read()
+except:
+    # NOTE(review): bare except -- any failure (including a missing
+    # config.STYLESHEET attribute) silently yields an empty stylesheet.
+    stylesheet = ''
+
+def get_title(url):
+    """ returns the title of a url if it is not None, else returns url
+    note that this implies linkList[url] """
+    link=linkList[url]
+    if link.title is None:
+        return url
+    return link.title
+
+def make_link(url,text):
+    """Return an <A>nchor to a url with <text>. If url is in the Linklist and
+    is external, insert "class=external" in the <A> tag."""
+    url = str(url) # because sometimes I lazily pass a Link object.
+    mystring = '<a href="' + url + '"'
+    try:
+        external = linkList[url].external
+    except KeyError:
+        # urls unknown to linkList are treated as internal
+        external = 0
+    if external:
+        mystring = mystring + ' class="external"'
+    else:
+        mystring = mystring + ' class="internal"'
+    # NOTE(review): url and text are inserted without HTML escaping.
+    mystring = mystring + '>' + text + '</a>'
+    return mystring
+
+def add_problem(type,link):
+    """ add a problem to the 'problems' database. Will not add external links"""
+    if link.external: return
+    global problem_db
+    author = link.author
+    if problem_db.has_key(author):
+        problem_db[author].append((type,link))
+    else:
+        problem_db[author]=[(type,link)]
+
+def sort_by_age(a,b):
+    """ sort helper for url's age. a and b are urls in linkList """
+    # ascending by age; ties are broken by author
+    aage, bage = linkList[a].age, linkList[b].age
+    if aage < bage:
+        return -1
+    if aage == bage:
+        return sort_by_author(a,b)
+    return 1
+
+def sort_by_rev_age(a,b):
+    """ sort helper: like sort_by_age but descending (oldest first) """
+    aage, bage = linkList[a].age, linkList[b].age
+    if aage > bage:
+        return -1
+    if aage == bage:
+        return sort_by_author(a,b)
+    return 1
+
+def sort_by_author(a,b):
+    """ sort helper: compares the repr() of the two links' authors """
+    aauthor,bauthor = `linkList[a].author`, `linkList[b].author`
+    if aauthor < bauthor:
+        return -1
+    if aauthor == bauthor:
+        return 0
+    return 1
+
+def sort_by_size(a,b):
+    """ sort helper: descending by totalSize (largest first) """
+    asize, bsize = linkList[a].totalSize, linkList[b].totalSize
+    if asize < bsize:
+        return 1
+    if asize == bsize:
+        return 0
+    return -1
+
+def main_index():
+    """Write the frameset page (config.MAIN_FILENAME) that holds the
+    navigation bar and the first plugin's report."""
+    # print statements below go to the file while sys.stdout is redirected
+    # NOTE(review): stdout is not restored if any print/open raises.
+    tmp = sys.stdout
+    fp = open_file(config.MAIN_FILENAME)
+    sys.stdout=fp
+
+    print '<html>'
+    print '<head>'
+    print '<title>Webcheck report for "%s"</title>' % get_title(`Link.base`)
+    print '<style type="text/css">'
+    print '<!-- /* hide from old browsers */'
+    print stylesheet
+    print ' --> </style>'
+    print '</head>'
+    print '<frameset COLS="%s,*" border=0 framespacing=0>' \
+          % config.NAVBAR_WIDTH
+    print '<frame name="navbar" src="%s" marginwidth=0 marginheight=0 frameborder=0>' \
+          % config.NAVBAR_FILENAME
+    print '<frame name="main" src="%s" frameborder=0>' % (webcheck.plugins[0]+'.html')
+    print '</frameset>'
+    print '</html>'
+    fp.close()
+    sys.stdout = tmp
+
+
+def nav_bar(plugins):
+    """Write the navigation bar page and, for every plugin (plus
+    'problems'), generate that plugin's report file."""
+    # navigation bar
+    fp=open_file(config.NAVBAR_FILENAME)
+    stdout = sys.stdout
+    sys.stdout = fp
+    print '<html>\n<head>'
+    print '\t<title>navbar</title>'
+    print '<style type="text/css">'
+    print '<!-- /* hide from old browsers */'
+    print stylesheet
+    print ' --> </style>'
+    print '\t<base target="main">'
+    print '</head>'
+    print '<body class="navbar">'
+    print '<div align=center>'
+    print '<table cellpadding="%s" cellspacing="%s">' \
+          % (config.NAVBAR_PADDING, config.NAVBAR_SPACING)
+    # title
+    print '<tr><th class="home">',
+    print '<a target="_top" href="%s" onMouseOver="window.status=\'Webcheck Home Page\'; return true;">Webcheck %s</a></th></tr>' \
+          % (version.home, version.webcheck)
+
+    # labels pointing to each individual page
+    for plugin in plugins + ['problems']:
+        # progress message goes to the real stdout, not the navbar file
+        debugio.write('\t' + plugin,file=stdout)
+        filename = plugin + '.html'
+        print '<tr><th>',
+        report = __import__('plugins.' + plugin, globals(), locals(), [plugin])
+        print '<strong><a href="%s" onMouseOver="window.status=\'%s\'; return true">%s</a></strong>' \
+              % (filename, report.__doc__, report.title),
+        print '</th></tr>'
+
+        # create the file we just pointed to
+        # NOTE(review): fp is rebound here, so the fp.close() at the end of
+        # this function closes the last report file a second time and the
+        # navbar file is never closed explicitly.
+        tmp = sys.stdout
+        fp = open_file(filename)
+        sys.stdout = fp
+        doTopMain(report)
+        report.generate()
+        report_version = report.__version__
+        if config.WARN_OLD_VERSION:
+            check_and_warn(plugin,report_version)
+        doBotMain()
+        fp.close()
+        sys.stdout = tmp
+
+    # NOTE(review): reconstructed as running once after the loop -- the
+    # collapsed source does not show whether this bare print was inside it.
+    print
+    print '</table>'
+    print '</div>'
+    print '</body>'
+    print '</html>'
+
+    fp.close()
+    sys.stdout = stdout
+
+def open_file(filename):
+    """ given config.OUTPUT_DIR checks if the directory already exists; if not, it creates it, and then opens filename for writing and returns the file object """
+    if os.path.isdir (config.OUTPUT_DIR) == 0:
+        os.mkdir(config.OUTPUT_DIR)
+    # plain concatenation: OUTPUT_DIR must already end with a path separator
+    return open(config.OUTPUT_DIR + filename,'w')
+
+def doTopMain(report):
+    """top part of html files in main frame prints to stdout"""
+    print '<html>'
+    print '<head><title>%s</title>' % report.title
+    print '<style type="text/css">'
+    print '<!-- /* hide from old browsers */'
+    print stylesheet
+    print ' --> </style>'
+    print '<meta name="Author" content="Webcheck ' + version.webcheck + '">'
+    print '</head>'
+    # body class is the plugin's module name without the package prefix
+    print '<body class="%s">' % string.split(report.__name__,'.')[1]
+    print '<p class="logo"><a '
+    print 'href="%s"><img src="%s" border=0 alt=""></a></p>' % (Link.base, config.LOGO_HREF)
+    print '\n<h1 class="basename">'
+    print '\t<a href="%s">%s</a>' \
+          % (`Link.base`, get_title(`Link.base`))
+    print '</h1>'
+    print '\n\n<table width="100%" cellpadding=4>'
+    print '\t<tr><th class="title">%s</th></tr>\n</table>\n' % report.title
+
+def doBotMain():
+    """ bottom part of html files in main frame"""
+    print
+    print '<hr>'
+    print '<p class="footer">'
+    print '<em>Generated %s by <a target="_top" href="%s">Webcheck %s</a></em></p>' \
+          % (webcheck.start_time,version.home, version.webcheck)
+    print '</body>'
+    print '</html>'
+
+
+def read_registry(url):
+    """Read file referenced by url and return a registry object.
+
+    The registry object is just a dictionary. The key is an individual
+    module name. The value is a two-item list consisting of the latest
+    version and the url where it can be retrieved. e.g.:
+        registry['mymodule'] = ['1.0','http://www.mymodule.com/']
+    """
+    registry = {}
+    lines = opener.open(url).readlines()
+    opener.close()
+    for line in lines:
+        fields = string.split(line)
+        # only lines of exactly three whitespace-separated fields count
+        if len(fields) != 3: continue
+        registry[fields[0]] = fields[1:]
+
+    return registry
+
+def check_and_warn(plugin,plugin_version):
+    """Check to see if Webcheck and plugin are up to date if so write it in
+    the report.
+    """
+    # Uses the module-level registry, which is only defined when
+    # config.WARN_OLD_VERSION is true (see the bottom of this module).
+
+    old_webcheck = 0
+    old_plugin = 0
+
+    # first check to see if webcheck is up to date
+    try:
+        if version.webcheck != registry['webcheck'][0]:
+            old_webcheck = 1
+    except KeyError:
+        pass
+    try:
+        if plugin_version != registry[plugin][0]:
+            old_plugin = 1
+    except KeyError:
+        pass
+
+    if (old_plugin + old_webcheck):
+        print '<table class="warning" cellpadding="4" cellspacing="0" border="0">'
+        print '<tr><td><strong>Warning:</strong> ',
+        if old_webcheck:
+            print 'The version of Webcheck you are using (%s) is outdated.' \
+                  % version.webcheck,
+            print 'You may download the latest version, %s, at ' \
+                  % registry['webcheck'][0],
+            print '<a href="%s" target="_top">%s</a>.<br><br>' \
+                  % (registry['webcheck'][1],registry['webcheck'][1])
+        if old_plugin:
+            print 'The %s plugin used to generate this report is outdated.' \
+                  % plugin,
+            print 'This version is %s. The latest version is %s ' \
+                  % (plugin_version, registry[plugin][0]),
+            print 'And may be downloaded at <a href="%s" target="_top">%s</a>.<br>' \
+                  % (registry[plugin][1],registry[plugin][1])
+        print '</td></tr></table>'
+
+# NOTE(review): reads version.registry -- confirm that version.py still
+# defines it before enabling config.WARN_OLD_VERSION.
+if config.WARN_OLD_VERSION:
+    registry = read_registry(version.registry)
+    debugio.write('registry = %s' % registry,4)
diff --git a/plugins/sitemap.py b/plugins/sitemap.py
new file mode 100644
index 0000000..8338e4e
--- /dev/null
+++ b/plugins/sitemap.py
@@ -0,0 +1,79 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+"""Your site at-a-glance"""
+
+__version__ = '1.0'
+__author__ = 'mwm@mired.org'
+
+import webcheck
+from rptlib import *
+
+title = 'Site Map'
+# current recursion depth of explore(); compared with REPORT_SITEMAP_LEVEL
+level = 0
+
+def explore(link, explored):
+    """Recursively do a breadth-first traversal of the graph of links
+    on the site. Returns a list of HTML fragments that can be printed
+    to produce a site map.
+
+    explored is a dictionary used as a set of already-visited URLs; it
+    is updated in place.
+    """
+
+    global level
+    if level > webcheck.config.REPORT_SITEMAP_LEVEL: return []
+    # XXX I assume an object without a .URL is something
+    # uninteresting? --amk
+    if not hasattr(link, 'URL'): return []
+
+    level=level+1
+    explored[ link.URL ] = 1
+    to_explore = []
+    L = ['<ul>']
+
+    # We need to do a breadth-first traversal. This requires two
+    # steps for any given page. First, we need to make a list of
+    # links to be traversed; links that have already been explored can
+    # be ignored.
+
+    for i in link.children:
+        # Skip pages that have already been traversed
+        if explored.has_key( i ): continue
+        if (i in webcheck.Link.badLinks) and not webcheck.config.ANCHOR_BAD_LINKS:
+            L.append('<li>%s' % i)
+        else:
+            to_explore.append(i)
+        # NOTE(review): reconstructed at loop level (marks bad links too);
+        # the collapsed source does not show whether it belongs to the else.
+        explored[ i ] = 1 # Mark the link as explored
+
+    # Now we loop over the list of links; the traversal will not go to
+    # any pages that are marked as having already been traversed.
+    for i in to_explore:
+        child = webcheck.Link.linkList[i]
+        L.append('<li>%s' % (make_link(i,get_title(i))))
+        L = L + explore(child, explored)
+
+    L.append( '</ul>' )
+    level=level-1
+
+    # If no sub-pages were traversed at all, just return an empty list
+    # to avoid redundant <UL>...</UL> pairs
+    if len(L) == 2: return []
+
+    return L
+
+# site map
+def generate():
+    """Print a link to the starting page followed by the nested site map."""
+    print make_link(webcheck.Link.base,'Starting Page')
+    L = explore(webcheck.Link.base, {})
+    for i in L: print i
diff --git a/plugins/slow.py b/plugins/slow.py
new file mode 100644
index 0000000..ab18d2f
--- /dev/null
+++ b/plugins/slow.py
@@ -0,0 +1,61 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+"""Pages that are slow to download"""
+
+__version__ = '1.0'
+__author__ = 'mwm@mired.org'
+
+import webcheck
+from httpcodes import HTTP_STATUS_CODES
+from rptlib import *
+
+Link = webcheck.Link
+linkList = Link.linkList
+config = webcheck.config
+
+title = "What's Slow"
+
+def generate():
+    """Print an HTML table of large HTML pages with estimated download
+    times, and record each listed page as a problem via add_problem()."""
+    import time
+    print '<div class="table">'
+    print '<table border=0 cellpadding=2 cellspacing=2 width="75%">'
+    print '\t<tr><th rowspan=2>Link</th>',
+    print '<th rowspan=2>Size <br>(Kb)</th>',
+    print '<th colspan=3>Time (HH:MM:SS)</th></tr>'
+    print '\t<tr><th>28.8</th><th>ISDN</th><th>T1</th></tr>'
+
+    urls = linkList.keys()
+    # sort_by_size comes from the rptlib star import
+    # NOTE(review): the break below presumably relies on it ordering the
+    # largest pages first -- confirm in rptlib.
+    urls.sort(sort_by_size)
+    for url in urls:
+        link = linkList[url]
+        if not link.html: continue
+        sizeK = link.totalSize / 1024       # kilobytes (shown in the table)
+        sizek = link.totalSize * 8 / 1000   # kilobits (used for the timings)
+        if sizeK < config.REPORT_SLOW_URL_SIZE:
+            break
+        # the three timings divide kilobits by 28.8, 56 and 1500 (kbit/s)
+        print '\t<tr><td>%s</td>' % make_link(url, get_title(url)),
+        print '<td>%s</td><td class="time">%s</td>' \
+              % (sizeK, time.strftime('%H:%M:%S',time.gmtime(int(sizek/28.8)))),
+        print '<td class="time">%s</td>' \
+              % time.strftime('%H:%M:%S',time.gmtime(int(sizek/56))),
+        print '<td class="time">%s</td>' \
+              % time.strftime('%H:%M:%S',time.gmtime(int(sizek/1500))),
+        print '</tr>'
+        add_problem('Slow Link: %sK' % sizeK, link)
+    print '</table>'
+    print '</div>'
diff --git a/plugins/whatsnew.py b/plugins/whatsnew.py
new file mode 100644
index 0000000..1c655af
--- /dev/null
+++ b/plugins/whatsnew.py
@@ -0,0 +1,49 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your
option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+"""Recently modified pages"""
+
+__version__ = '1.0'
+__author__ = 'mwm@mired.org'
+
+import webcheck
+from httpcodes import HTTP_STATUS_CODES
+from rptlib import *
+
+Link = webcheck.Link
+linkList = Link.linkList
+config = webcheck.config
+
+title = "What's New"
+
+# what's new
+def generate():
+    """Print an HTML table of HTML pages whose age is known and does not
+    exceed config.REPORT_WHATSNEW_URL_AGE."""
+    print '<div class="table">'
+    print '<table border=0 cellpadding=2 cellspacing=2 width="75%">'
+    print '\t<tr><th>Link</th><th>Author</th><th>Age</th></tr>'
+    urls = linkList.keys()
+    # sort_by_age comes from the rptlib star import
+    urls.sort(sort_by_age)
+    for url in urls:
+        link=linkList[url]
+        if not link.html: continue
+        age = link.age
+        # "is not None" keeps pages whose age is 0
+        if (age is not None)and (age <= config.REPORT_WHATSNEW_URL_AGE):
+            print '\t<tr><td>%s</td>' % make_link(url,get_title(url)),
+            print '<td>%s</td>' % link.author,
+            print '<td class="time">%s</td></tr>' % age
+    print '</table>'
+    print '</div>'
diff --git a/plugins/whatsold.py b/plugins/whatsold.py
new file mode 100644
index 0000000..51d1ad2
--- /dev/null
+++ b/plugins/whatsold.py
@@ -0,0 +1,50 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+"""Potentially outdated pages"""
+
+__version__ = '1.0'
+__author__ = 'mwm@mired.org'
+
+import webcheck
+from httpcodes import HTTP_STATUS_CODES
+from rptlib import *
+
+Link = webcheck.Link
+linkList = Link.linkList
+config = webcheck.config
+
+title = "What's Old"
+
+# what's old
+def generate():
+    """Print an HTML table of HTML pages at least
+    config.REPORT_WHATSOLD_URL_AGE old and record each one as a problem
+    via add_problem()."""
+    print '<div class="table">'
+    print '<table border=0 cellpadding=2 cellspacing=2 width="75%">'
+    print '\t<tr><th>Link</th><th>Author</th><th>Age</th></tr>'
+    urls = linkList.keys()
+    # sort_by_rev_age comes from the rptlib star import
+    urls.sort(sort_by_rev_age)
+    for url in urls:
+        link=linkList[url]
+        if not link.html: continue
+        age = link.age
+        # truth test skips both unknown (None) and zero ages
+        if age and (age >= config.REPORT_WHATSOLD_URL_AGE):
+            print '\t<tr><td>%s</td>' % make_link(url,get_title(url)),
+            print '<td>%s</td>' % (link.author),
+            print '<td class="time">%s</td></tr>' % age
+            add_problem('Old Link: %s days old' % age ,link)
+    print '</table>'
+    print '</div>'
diff --git a/robotparser.py b/robotparser.py
new file mode 100644
index 0000000..b479dd6
--- /dev/null
+++ b/robotparser.py
@@ -0,0 +1,103 @@
+"""
+
+Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+
+Change made by marduk@python.net to support proxies.
+RobotFileParser class can be instantiated with optional proxies parameter,
+just like FancyURLopener in urllib.
+
+"""
+
+class RobotFileParser:
+    """Parse a robots.txt file and answer can_fetch() queries.
+
+    self.rules maps a user-agent string to a list of compiled patterns,
+    one per disallowed path prefix.  An agent with an empty list may
+    fetch everything.
+    """
+
+    def __init__(self, proxies = None):
+        self.proxies = proxies
+        self.rules = {}
+        self.debug = 0
+        self.url = ''
+        self.last_checked = 0
+
+    def mtime(self):
+        """Return the time (as time.time()) of the last parse, or 0."""
+        return self.last_checked
+
+    def modified(self):
+        """Record now as the time the rules were last parsed."""
+        import time
+        self.last_checked = time.time()
+
+    def set_url(self, url):
+        """Set the URL that read() will fetch."""
+        self.url = url
+##      import urlmisc
+##      self.url = urlmisc.canonical_url(url)
+
+    def read(self):
+        """Fetch self.url (through self.proxies) and parse it."""
+        import urllib
+        urlopener = urllib.FancyURLopener(self.proxies)
+        self.parse(urlopener.open(self.url).readlines())
+
+    def parse(self, lines):
+        """Build self.rules from an iterable of robots.txt lines.
+
+        Lines may or may not carry a trailing newline.  A blank line
+        ends the current record; text after '#' is a comment.
+        """
+        import re
+        active = []
+        for line in lines:
+            if self.debug: print '>', line,
+            # blank line terminates current record
+            if not line.strip():
+                active = []
+                continue
+            # remove optional comment and strip line.  Only cut when a
+            # '#' is really there: slicing with find()'s -1 used to chop
+            # the last character of a line without a trailing newline.
+            hash_pos = line.find('#')
+            if hash_pos >= 0:
+                line = line[:hash_pos]
+            line = line.strip()
+            if not line:
+                continue
+            # split on the first colon only, so values that contain ':'
+            # (e.g. "Disallow: /a:b") are kept whole
+            fields = line.split(':', 1)
+            if len(fields) == 2:
+                name = fields[0].strip().lower()
+                value = fields[1].strip()
+                if name == 'user-agent':
+                    # this record applies to this user agent
+                    if self.debug: print '>> user-agent:', value
+                    active.append(value)
+                    if not self.rules.has_key(value):
+                        self.rules[value] = []
+                elif name == 'disallow':
+                    if value:
+                        if self.debug: print '>> disallow:', value
+                        # robots.txt paths are literal prefixes, not
+                        # regular expressions: escape them so '.', '?',
+                        # '+' etc. match themselves (and cannot raise
+                        # re.error on malformed patterns)
+                        rule = re.compile(re.escape(value))
+                        for agent in active:
+                            self.rules[agent].append(rule)
+                    else:
+                        # an empty Disallow means "allow everything"
+                        for agent in active:
+                            if self.debug: print '>> allow', agent
+                            self.rules[agent] = []
+                else:
+                    if self.debug: print '>> unknown:', fields
+            else:
+                if self.debug: print '>> unknown:', fields
+
+        self.modified()
+
+    # returns true if agent is allowed to fetch url
+    def can_fetch(self, agent, url):
+        """Return 1 if agent may fetch url, else 0.
+
+        Uses the rules for agent, falling back to '*'.  With no
+        applicable rules the fetch is allowed.
+        """
+        import urlparse
+        ag = agent
+        if not self.rules.has_key(ag): ag = '*'
+        if not self.rules.has_key(ag):
+            if self.debug: print '>> allowing', url, 'fetch by', agent
+            return 1
+        path = urlparse.urlparse(url)[2]
+        for rule in self.rules[ag]:
+            # match() anchors at the start: a rule is a path prefix
+            if rule.match(path):
+                if self.debug: print '>> disallowing', url, 'fetch by', agent
+                return 0
+        if self.debug: print '>> allowing', url, 'fetch by', agent
+        return 1
+
+def test():
+    """Manual smoke test: fetches a live robots.txt over the network and
+    prints the parsed rules and a few can_fetch() answers."""
+    rp = RobotFileParser()
+    rp.debug = 1
+    rp.set_url('http://www.automatrix.com/robots.txt')
+    rp.read()
+    print rp.rules
+    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
+    print rp.can_fetch('Musi-Cal-Robot',
+                       'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
+
+    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
+    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
diff --git a/schemes/__init__.py b/schemes/__init__.py
new file mode 100644
index 0000000..4b915ca
--- /dev/null
+++ b/schemes/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+# hi mom
diff --git a/schemes/filelink.py b/schemes/filelink.py
new file mode 100644
index 0000000..0c0cb7c
--- /dev/null
+++ b/schemes/filelink.py
@@ -0,0 +1,57 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+"""This module defines the functions needed for creating Link objects for urls
+using the file scheme"""
+
+import urlparse
+import os
+import time
+import mimetypes
+import myUrlLib
+import re
+
+mimetypes.types_map['.shtml']='text/html'
+
+def init(self, url, parent):
+    """Fill in a Link object (self) for a file: url.
+
+    Sets self.URL, and on success self.size, self.age (in days) and
+    self.type.  If the file cannot be stat'ed the link is marked bad via
+    self.set_bad_link() and nothing else is set.
+    """
+    self.URL = myUrlLib.basejoin(parent,url)
+    parsed = urlparse.urlparse(self.URL,'file',0)
+    filename = parsed[2]
+    if os.name != 'posix':
+        # Turn "/C|/dir" or "///C:/dir" into "C:/dir".  The groups must be
+        # written with plain parentheses: in the re module "\(" matches a
+        # literal "(" so the old pattern never matched a drive letter.
+        filename = re.sub(r"^/(//)?([a-zA-Z])[|:]", r"\2:", filename)
+    try:
+        stats = os.stat(filename)
+    except os.error:
+        self.set_bad_link(self.URL, "No such file or directory")
+        return
+
+    # stat tuple index 6 is the size in bytes
+    self.size = stats[6]
+
+    # stat tuple index 8 is the modification time
+    lastMod = stats[8]
+    self.age = int((time.time()-lastMod)/myUrlLib.SECS_PER_DAY)
+
+    self.type = mimetypes.guess_type(url)[0]
+    if self.type is None: self.type = 'application/octet-stream' # good enough?
+
+def get_document(url):
+    """Return the contents of the file named by a file: url as a string."""
+    parsed = urlparse.urlparse(url,'file',0)
+    filename = parsed[2]
+    if os.name != 'posix':
+        # Turn "/C|/dir" or "///C:/dir" into "C:/dir".  The groups must be
+        # written with plain parentheses: in the re module "\(" matches a
+        # literal "(" so the old pattern never matched a drive letter.
+        filename = re.sub(r"^/(//)?([a-zA-Z])[|:]", r"\2:", filename)
+
+    # close the file explicitly instead of leaving it to the collector
+    fp = open(filename,'r')
+    try:
+        return fp.read()
+    finally:
+        fp.close()
+
diff --git a/schemes/ftplink.py b/schemes/ftplink.py
new file mode 100644
index 0000000..8f09d0a
--- /dev/null
+++ b/schemes/ftplink.py
@@ -0,0 +1,125 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 1998,1999 Mike Meyer <mwm@mired.org>
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+"""This module defines the functions needed for creating Link objects for urls
+using the ftp scheme"""
+
+import urllib
+import mimetypes
+import ftplib
+import urlparse
+import myUrlLib
+import string
+import posixpath
+import debugio
+
+Link = myUrlLib.Link
+
+def _connect(host, port, user, passwd):
+    """Open and log in to an FTP connection, honouring the url's port."""
+    ftp = ftplib.FTP()
+    ftp.connect(host, port)
+    ftp.login(user, passwd)
+    return ftp
+
+def init(self, url, parent):
+    """Fill in a Link object (self) for an ftp: url.
+
+    Sets self.URL and self.type, then checks the path on the server.
+    Any ftplib error while connecting or checking marks the link bad via
+    self.set_bad_link(); otherwise self.size is set (0 if unknown).
+    """
+
+    self.URL = myUrlLib.basejoin(parent,url)
+    self.type = mimetypes.guess_type(url)[0]
+
+    # parse the joined URL (as the file and http schemes do), so a
+    # relative url still yields the right host
+    host, port, user, passwd, pathname = parseurl(self.URL)
+    ftp = None
+    try:
+        try:
+            ftp = _connect(host, port, user, passwd)
+            stat(pathname, ftp)
+        except ftplib.all_errors, errtext:
+            self.set_bad_link(self.URL, str(errtext))
+            return
+
+        self.size = size(pathname,ftp)
+        if self.size is None: self.size = 0
+    finally:
+        # the connection used to be left open for every ftp link
+        if ftp is not None:
+            ftp.close()
+
+def callback(line):
+    """Read a line of text and do nothing with it"""
+    return
+
+def stat(pathname, ftpobject):
+    """Change to pathname's directory and list the file with NLST.
+
+    Raises an ftplib error if the directory change or listing fails.
+    """
+    # This is not completely implemented
+    # Note: ftp servers do not respond with a 5xx error when a file does not
+    # exist except for GET, which I'm trying to GET around ;-) Anyway, an
+    # error code will be reported if you try to change to a directory that
+    # does not exist, so this is not totally useless
+    # In addition to the above, all of the ftp servers i tested this on
+    # did not report the correct code (211,212,213) when responding to STAT
+    # per RFC959. What the hell is up with that? Can checking ftp links be
+    # done reliably?
+    # FTP should be replaced by a new protocol that produces machine-readable
+    # responses and actually lets you get the status of a file without having to
+    # download it. Oh wait, that's what HTTP is.
+    dirs, filename = split_dirs(pathname)
+    cwd(dirs, ftpobject)
+    response = ftpobject.retrlines('NLST %s' % filename,callback)
+    debugio.write(response,2)
+
+def get_document(url):
+    """Download the file named by an ftp: url and return its contents."""
+    host, port, user, passwd, pathname = parseurl(url)
+    dirs, filename = split_dirs(pathname)
+    ftp = _connect(host, port, user, passwd)
+    # retrbinary() requires a callback and returns only the server's
+    # status line, so collect the data blocks ourselves
+    chunks = []
+    try:
+        cwd(dirs, ftp)
+        ftp.retrbinary('RETR %s' % filename, chunks.append)
+    finally:
+        ftp.close()
+    return string.join(chunks, '')
+
+def split_dirs(pathname):
+    """Given pathname, split it into a tuple consisting of a list of dirs and
+    a filename"""
+
+    dirs, filename = posixpath.split(pathname)
+    dirs = string.split(dirs,'/')
+    # a leading '' means the path was absolute: start from the root
+    if dirs[0] == '': dirs[0] = '/'
+    if not filename:
+        # pathname ended in '/': treat the last directory as the target
+        filename = dirs[-1]
+        dirs = dirs[:-1]
+    return (dirs, filename)
+
+def size(pathname,ftpobject):
+    """Return the size reported by the server for pathname (may be None)."""
+    if pathname == '': pathname = '/'
+    dirs, filename = split_dirs(pathname)
+    debugio.write('pathname =%s' % pathname,3)
+    debugio.write('dirs= %s' % dirs,3)
+    debugio.write('filename= %s' % filename,3)
+    cwd(dirs, ftpobject)
+    return ftpobject.size(filename)
+
+def cwd(dirs, ftpobject):
+    """Change directory on the server, one path component at a time."""
+    for dir in dirs:
+        ftpobject.cwd(dir)
+
+def parseurl(url):
+    """Split an ftp url into (host, port, user, passwd, pathname).
+
+    Without user info the login is anonymous; without a port the
+    default ftplib.FTP_PORT is used.
+    """
+    parsed = urlparse.urlparse(url)
+    host = parsed[1]
+    if '@' in host:
+        # split at the last '@' and the first ':' so that passwords
+        # containing either character no longer break the unpacking
+        at = string.rfind(host, '@')
+        userpass, host = host[:at], host[at+1:]
+        if ':' in userpass:
+            user, passwd = string.split(userpass,':',1)
+        else:
+            user = userpass
+            passwd = None
+    else:
+        user = 'anonymous'
+        # this is bad, i'll change it later
+        passwd = 'mwm@mired.org'
+
+    if ':' in host:
+        host, port = string.split(host,':')
+        port = int(port)
+    else:
+        port = ftplib.FTP_PORT
+
+    pathname = parsed[2]
+    if not port: port = ftplib.FTP_PORT
+    return (host, port, user, passwd, pathname)
diff --git a/schemes/httplink.py b/schemes/httplink.py
new file mode 100644
index 0000000..c792d84
--- /dev/null
+++ b/schemes/httplink.py
@@ -0,0 +1,167 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public
License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+"""This module defines the functions needed for creating Link objects for urls
+using the http scheme"""
+
+import myUrlLib
+import string
+import httplib
+import urllib
+import time
+import urlparse
+import base64
+import mimetypes
+import debugio
+import version
+
+config = myUrlLib.config
+Link = myUrlLib.Link
+proxies = config.PROXIES
+if proxies is None:
+    proxies = urllib.getproxies()
+# number of redirects followed for the url currently being checked
+redirect_depth = 0
+
+opener = urllib.FancyURLopener(proxies)
+opener.addheaders = [('User-agent','Webcheck ' + version.webcheck)]
+
+def get_reply(url):
+    """Open connection to url and report information given by HEAD command
+
+    Returns (status, message, headers, final_url).  301/302 redirects are
+    followed up to config.REDIRECT_DEPTH; if the target is already in
+    Link.linkList its stored result is returned instead of re-fetching.
+    """
+
+    global redirect_depth
+    parsed = urlparse.urlparse(url)
+    if proxies and proxies.has_key('http'):
+        # via a proxy: connect to the proxy and request the full url
+        host = urlparse.urlparse(proxies['http'])[1]
+        document = url
+
+    else:
+        host = parsed[1]
+        # path, plus ";params" and "?query" when present.  The old
+        # join of parsed[2:4] dropped the query string entirely.
+        document = parsed[2]
+        if parsed[3]: document = document + ';' + parsed[3]
+        if parsed[4]: document = document + '?' + parsed[4]
+
+    if not document: document = '/'
+    debugio.write('document= %s' % document,3)
+
+    (username, passwd, realhost, port) = parse_host(host)
+
+    h = httplib.HTTP()
+    if port:
+        # parse_host() returns the port as a string; connect() needs an int
+        h.connect(realhost, int(port))
+    else:
+        h.connect(realhost)
+
+    h.putrequest('HEAD', document)
+    if username and passwd:
+        auth = string.strip(base64.encodestring(username + ":" + passwd))
+        h.putheader('Authorization', 'Basic %s' % auth)
+    h.putheader('User-Agent','Webcheck %s' % version.webcheck)
+    # NOTE(review): when a proxy is used realhost is the proxy's host, so
+    # this Host header names the proxy rather than the target -- confirm.
+    h.putheader('Host',realhost)
+
+    h.endheaders()
+
+    errcode, errmsg, headers = h.getreply()
+    h.close()
+    debugio.write(errcode,2)
+    debugio.write(errmsg,2)
+    if errcode == 301 or errcode == 302:
+        redirect_depth = redirect_depth + 1
+        if redirect_depth > config.REDIRECT_DEPTH:
+            debugio.write('\tToo many redirects!')
+            redirect_depth = 0
+            return (errcode, errmsg, headers, url)
+        if not headers.has_key('location'):
+            # a redirect without a target: report it as it is instead of
+            # failing with a KeyError
+            debugio.write('\tRedirect without a Location header')
+            redirect_depth = 0
+            return (errcode, errmsg, headers, url)
+        redirect = headers['location']
+        redirect = urlparse.urljoin(url,redirect)
+        if redirect == url:
+            debugio.write('\tRedirect same as source: %s' % redirect)
+            redirect_depth = 0
+            return (errcode, errmsg, headers, url)
+        debugio.write('\tRedirected to: ' + redirect)
+        if Link.linkList.has_key(redirect):
+            link = Link.linkList[redirect]
+            redirect_depth = 0
+            return (link.status, link.message, link.headers, link.URL)
+        return get_reply(redirect)
+    # end of the chain: reset the counter so that redirects followed for
+    # this url do not count against the next one
+    redirect_depth = 0
+    return (errcode, errmsg, headers, url)
+
+def init(self, url, parent):
+    """ Here, self is a reference of the link object that is calling this
+    pseudo-method
+
+    Sets status, message, headers, URL, type and size on self, registers
+    self in Link.linkList under the final URL, marks the link bad when
+    the status is neither 200 nor 'Not Checked', and otherwise sets
+    self.age (in days) when a usable Last-Modified header is present."""
+
+    (self.status, self.message, self.headers, self.URL) = get_reply(myUrlLib.basejoin(parent,url))
+    Link.linkList[self.URL] = self
+    try:
+        self.type = self.headers.gettype()
+    except AttributeError:
+        self.type = 'text/html' # is this a good enough default?
+
+    debugio.write('\tContent-type: ' + self.type,2)
+    try:
+        self.size = int(self.headers['content-length'])
+    except (KeyError, TypeError, ValueError):
+        # missing, absent headers object, or a non-numeric value
+        self.size = 0
+
+    if (self.status != 200) and (self.status != 'Not Checked'):
+        self.set_bad_link(self.URL,str(self.status) + ": " + self.message)
+        return
+
+    try:
+        lastMod = time.mktime(self.headers.getdate('Last-Modified'))
+    except (OverflowError, TypeError, ValueError):
+        lastMod = None
+    if lastMod:
+        self.age = int((time.time()-lastMod)/myUrlLib.SECS_PER_DAY)
+
+def get_document(url):
+    """Fetch url with the module-level opener and return the body."""
+    document = opener.open(url).read()
+    opener.cleanup()
+    return document
+
+def parse_host(location):
+    """Return a tuple (user, password, host, port)
+
+    takes string http://user:password@hostname:hostport and
+    returns a tuple. If a field is null in the string it will be
+    returned as None in the tuple.  The port is returned as a string.
+    """
+
+    #location = urlparse.urlparse(host)[1]
+    debugio.write("network location= %s" % location,3)
+
+    at = string.find(location, "@")
+    if at > -1:
+        userpass = location[:at]
+        colon = string.find(userpass, ":")
+        if colon > -1:
+            user = userpass[:colon]
+            passw = userpass[colon+1:]
+        else:
+            user = userpass
+            passw = None
+        hostport = location[at+1:]
+    else:
+        user = passw = None
+        hostport = location
+
+    colon = string.find(hostport, ":")
+    if colon > -1:
+        hostname = hostport[:colon]
+        port = hostport[colon+1:]
+    else:
+        hostname = hostport
+        port = None
+
+    debugio.write("parse_host = %s %s %s %s" % (user, passw, hostname, port),3)
+    return (user, passw, hostname, port)
+
diff --git a/version.py b/version.py
new file mode 100644
index 0000000..2c33a07
--- /dev/null
+++ b/version.py
@@ -0,0 +1,24 @@
+# Copyright (C) 1998,1999 marduk <marduk@python.net>
+# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ + +"""Contains version and other static information""" + +webcheck="1.0" +authors='Mike Meyer <mwm@mired.org>' +home='http://www.mired.org/webcheck/' +#registry='http://starship.python.net/crew/marduk/webcheck/registry' diff --git a/webcheck.css b/webcheck.css new file mode 100644 index 0000000..ba94323 --- /dev/null +++ b/webcheck.css @@ -0,0 +1,126 @@ +/* "Global" Settings */ +BODY { + background: #ffffff; + font-size: 10pt; +} + +A:link { + color: #0000cd; +} + +A.external:link { + font-style: italic; +} + +A:active { + color: #0000ff; +} + +A:visited { + color: #bc0000; +} + +TH { + background: #cccc99; + color: #000000; +} + +TD { + background: #eeeee0; + color: #000000; +} + +BODY.navbar { + background: + url(http://www.mired.org/webcheck/blackbar.png); +} +.navbar TH { + background: #000000 url(http://www.mired.org/webcheck/blackbar.png); + color: #ffffff; + font-family: arial, sans-serif; +} + +.highlight TH { + background: #ffffff; + color: #000000; +} + +.navbar A:link { + background: none; + color: #ffffff; +} + +.navbar A:active { + background: none; + color: #00ff00; +} + +.navbar A:visited { + background: none; + color: #ffffff; +} + +.navbar TH.home { + background: #cd0000; + color: #ffffff; +} + +P.logo { + text-align: center; +} + +H1 { + font-size: 1.5em; +} +H1.basename { + text-align: center; +} + +TH.title { + background: #cd0000; + color: #ffffff; + font-family: "comic sans ms", verdana, sans-serif; + font-size: 1.3em; + text-align: left; +} + +TR.link { + background: #cccc99; + color: #000000; +} + +TR.status { + background: #bbbbb0; + color: #000000; +} + +TR.parent { + background: #ddddd0; + color: #000000; +} + +/* time/age fields */ +TD.time { + + background: #ddddd0; + color: #000000; + text-align: right; +} + +TD.blank { + background: #ffffff; +} + +DIV.table { + /* the only way I know of to align tables via CSS */ + text-align: center; +} + + +P.authorlist { + font-family: helvetica, sans-serif; + font-size: smaller; + font-weight: 
bold; + text-align: center; +} + diff --git a/webcheck.py b/webcheck.py new file mode 100755 index 0000000..f2d5fd1 --- /dev/null +++ b/webcheck.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python + +# Copyright (C) 1998,1999 marduk <marduk@python.net> +# Copyright (C) 2002 Mike Meyer <mwm@mired.org> + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +USAGE='webcheck [-abvq][-l url][-x url]... [-y url]... [-r depth][-o dir][-w sec][-d level] url [location]...' +PYTHON_VERSION=1.5 # not used right now +explored = [] +problem_db = {} +linkList = {} + +import sys +import time + + +start_time = time.ctime(time.time()) + +# importing the config.py file is a real problem if the user did not install +# the files EXACTLY the way I said to... or even using the frozen version is +# becoming a real bitch. I will just have to tell them right out how to fix it. +try: + sys.path = ['.'] + sys.path + import config +except ImportError: + sys.stdout.write('Please verify that PYTHONPATH knows where to find "config.py"\n') + sys.exit(1) + +import myUrlLib +Link=myUrlLib.Link + +# myUrlLib will be looking for a 'config' module. set it up here. 
+myUrlLib.config=config + +import debugio +debugio.DEBUG_LEVEL = config.DEBUG_LEVEL + +import version + +def parse_args(): + import getopt + global URL + try: + optlist, args = getopt.getopt(sys.argv[1:],'vl:x:y:ar:o:bw:d:q') + except getopt.error, reason: + print reason + print USAGE + sys.exit(1) + for flag,arg in optlist: + if flag=='-v': + print_version() + sys.exit(0) + elif flag=='-x': + config.EXCLUDED_URLS.append(arg) + elif flag=='-y': + config.YANKED_URLS.append(arg) + elif flag=='-a': + config.AVOID_EXTERNAL_LINKS=1 + elif flag=='-r': + config.REDIRECT_DEPTH=int(arg) + elif flag=='-o': + config.OUTPUT_DIR=arg + elif flag=='-b': + config.BASE_URLS_ONLY=1 + elif flag=='-w': + config.WAIT_BETWEEN_REQUESTS=int(arg) + elif flag=='-l': + config.LOGO_HREF=arg + elif flag=='-d': + debugio.DEBUG_LEVEL=int(arg) + elif flag=='-q': + debugio.DEBUG_LEVEL=0 + + if len(args)==0: + print USAGE + sys.exit(1) + else: URL = args[0] + config.HOSTS=args[1:] + +def print_version(): + """Print version information""" + import os + print " Webcheck: " + version.webcheck + print " Python: " + sys.version + print " OS: " + os.name + print + +def warn(): + """Warn the user that something has gone wrong.""" + print "*******************************************" + print "* *" + print "* Warning, Webcheck has found nothing to *" + print "* report for this site. If you feel this *" + print "* is in error, please contact *" + print "* %s. *" % version.authors + print "* and specify the environment that caused *" + print "* this to occur. 
*" + print "* *" + print "* Webcheck %s *" % version.webcheck + print "* *" + print "*******************************************" + +# set up the pages +plugins = config.PLUGINS + +if __name__ == '__main__': + + parse_args() + config.OUTPUT_DIR=config.OUTPUT_DIR + '/' + + debugio.write('checking site....') + try: + Link.base = Link(URL,None) # this will take a while + except KeyboardInterrupt: + sys.stderr.write("Interrupted\n") + sys.exit(1) + debugio.write('done.') + if not hasattr(Link.base,"URL"): + warn() + sys.exit(1) + + linkList = Link.linkList + + # now we can write out the files + # start with the frame-description page + debugio.write('Generating reports...') + from plugins.rptlib import main_index, nav_bar + main_index() + nav_bar(plugins) + debugio.write('done.') + diff --git a/webcheck.sh b/webcheck.sh new file mode 100755 index 0000000..b472e87 --- /dev/null +++ b/webcheck.sh @@ -0,0 +1,4 @@ +#! /bin/sh +PYTHONPATH="/home/mwm/src/webcheck:$PYTHONPATH" +PATH="/usr/opt/bin:$PATH" +/home/mwm/src/webcheck/webcheck.py "$@" |