1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package gov.nasa.pds.tools.file;
16
17 import java.io.File;
18 import java.io.FileFilter;
19 import java.io.IOException;
20 import java.io.InputStreamReader;
21 import java.net.MalformedURLException;
22 import java.net.URL;
23 import java.util.Arrays;
24 import java.util.Iterator;
25 import java.util.LinkedHashSet;
26 import java.util.List;
27 import java.util.ArrayList;
28 import java.util.Set;
29
30 import javax.swing.text.BadLocationException;
31 import javax.swing.text.EditorKit;
32 import javax.swing.text.SimpleAttributeSet;
33 import javax.swing.text.html.HTML;
34 import javax.swing.text.html.HTMLDocument;
35 import javax.swing.text.html.HTMLEditorKit;
36
37 import org.apache.commons.io.filefilter.FileFilterUtils;
38 import org.apache.commons.io.filefilter.IOFileFilter;
39 import org.apache.commons.io.filefilter.NotFileFilter;
40 import org.apache.commons.io.FileUtils;
41 import org.apache.commons.io.FilenameUtils;
42
43 import gov.nasa.pds.tools.file.filefilter.WildcardOSFilter;
44
45 /***
46 * Class that can generate a list of files from a supplied directory and optionally, a specified
47 * filter. The resulting files and directories are stored in a FileList.
48 *
49 * @author mcayanan
50 * @version $Revision $
51 */
52 public class FileListGenerator {
53
54 private NotFileFilter noFileFilter;
55 private IOFileFilter noDirFilter;
56 private IOFileFilter fileFilter;
57 private IOFileFilter effFileFilter;
58 private FileFilter effDirFilter;
59 private final int fileExt = 3;
60
61 /***
62 * Default constructor
63 *
64 */
65 public FileListGenerator() {
66 fileFilter = new WildcardOSFilter("*");
67 noFileFilter = null;
68 noDirFilter = null;
69 }
70
71 /***
72 * Sets the filter to be used when searching for files in a directory
73 * @param wildcards a list of files and/or file patterns to match
74 */
75 private void setFileFilter(List wildcards) {
76 fileFilter = new WildcardOSFilter(wildcards);
77 }
78
79 /***
80 * Sets the filter to be used when searching for files to ignore in a directory
81 * @param wildcards a list of files and/or file patterns to ignore
82 */
83 private void setNoFileFilter(List wildcards) {
84 noFileFilter = new NotFileFilter(new WildcardOSFilter(wildcards));
85 }
86
87 /***
88 * Sets the filter to be used when searching for directories to ignore
89 * @param patterns a list of directory/directory patterns to ignore
90 */
91 private void setNoDirFilter(List patterns) {
92 noDirFilter = new NotFileFilter(new WildcardOSFilter(patterns));
93 }
94
95 /***
96 * Combines filters to include and exclude files using the AND file filter
97 *
98 */
99 private void setEffFileFilter() {
100 if(noFileFilter != null)
101 effFileFilter = FileFilterUtils.andFileFilter(fileFilter, noFileFilter);
102 else
103 effFileFilter = fileFilter;
104 }
105
106 /***
107 * Combines filters to seek out directories to exclude using the AND file filter
108 *
109 */
110 private void setEffDirFilter() {
111 if(noDirFilter != null)
112 effDirFilter = FileFilterUtils.andFileFilter(noDirFilter, FileFilterUtils.directoryFileFilter());
113 else
114 effDirFilter = FileFilterUtils.directoryFileFilter();
115 }
116
117 /***
118 * Sets all possible filters when looking in a directory.
119 *
120 * @param regexp File patterns to include when finding files in a directory
121 * @param noFiles File patterns to ignore when finding files in a directory
122 * @param noDirs Directory patterns to ignore when finding sub-directories
123 */
124 public void setFilters(List regexp, List noFiles, List noDirs) {
125 if(regexp != null)
126 setFileFilter(regexp);
127 if(noFiles != null)
128 setNoFileFilter(noFiles);
129 if(noDirs != null)
130 setNoDirFilter(noDirs);
131
132 setEffFileFilter();
133 setEffDirFilter();
134 }
135
136 /***
137 * Allows one to pass in a file or URL. Directories will be visited if the
138 * target is a directory. The resulting list is stored in a FileList object.
139 *
140 * @param getSubDirs 'true' to look for sub-directories, 'false' to just search for files when
141 * given a directory as input
142 * @return A FileList object that contains the files and sub-directories
143 * @throws BadLocationException
144 * @throws IOException
145 */
146 public FileList visitTarget(String target, boolean getSubDirs) throws IOException, BadLocationException {
147 File file = null;
148 FileList fileList = new FileList();
149
150 try {
151 URL url = new URL(target.toString());
152 if((file = FileUtils.toFile(url)) != null) {
153 return (visitFileTarget(file, getSubDirs));
154 }
155 if(!isLinkFile(target.toString()))
156 fileList = crawl(new URL(target.toString()), getSubDirs);
157 else
158 fileList.addToFileList(url);
159 }
160 catch(MalformedURLException uEx) {
161 return (visitFileTarget(new File(target), getSubDirs));
162 }
163 return fileList;
164 }
165
166 /***
167 * Visits the file being supplied. If a directory is being passed in, then it
168 * will look for files and sub-directories (if it is turned ON). Otherwise, it
169 * simply gets stored in the FileList.
170 *
171 * @param file A file or a directory. If it is a directory, then the visitDir method
172 * will be called and the list of files can be retrieved via the getFiles and
173 * getSubDirs methods.
174 * @param getSubDirs Tells the method whether to look for sub-directories
175 * @return a FileList object
176 * @throws IOException
177 */
178 private FileList visitFileTarget(File file, boolean getSubDirs) throws IOException {
179 FileList fileList = new FileList();
180 if(file.isDirectory())
181 fileList = visitDir(file, getSubDirs);
182 else
183 fileList.addToFileList(file);
184
185 return fileList;
186 }
187
188 /***
189 * Gets a list of files under a given directory.
190 *
191 * Filters must be set via setFileFilters prior to calling this method in order to look
192 * for specific files and filter out un-wanted files and sub-drirectories.
193 *
194 * @param dir the name of the directory
195 * @param getSubDirs 'true' to get a list of sub-directories
196 * @return A FileList object containing the files and sub-directories found
197 * @throws IOException
198 */
199 public FileList visitDir(File dir, boolean getSubDirs) throws IOException {
200 FileList fileList = new FileList();
201
202 if( !dir.isDirectory() )
203 throw new IllegalArgumentException("parameter 'dir' is not a directory: " + dir);
204
205
206 fileList.addToFileList(FileUtils.listFiles(dir, effFileFilter, null));
207
208
209 if(getSubDirs)
210 fileList.addToDirList(Arrays.asList(dir.listFiles(effDirFilter)));
211
212 return fileList;
213 }
214
215 /***
216 * Crawls a directory URL, looking for files and sub-directories. Files found in a URL
217 * are assumed to end with a ".xxx".
218 *
219 * Filters must be set via the setFileFilters method prior to crawling in order to look for
220 * files and filter out un-wanted files and directories.
221 *
222 * @param url The URL to crawl
223 * @param getSubDirURLs Set to 'true' to retrieve sub-directory URLs, 'false' otherwise
224 * @return A FileList object containing the files and sub-directories that were found.
225 * @throws IOException
226 * @throws BadLocationException
227 */
228 public FileList crawl(URL url, boolean getSubDirURLs) throws IOException, BadLocationException {
229 Set links = new LinkedHashSet();
230 FileList fileList = new FileList();
231
232 links.addAll(getHyperLinks(url));
233 fileList.addToFileList(getFileURLNames(url, links));
234
235 if(getSubDirURLs) {
236 fileList.addToDirList(getSubDirURLNames(url, links));
237 }
238
239 return fileList;
240 }
241
242 /***
243 * Gets hyperlinks found in an HTML document of a URL. No duplicate links will be returned.
244 *
245 * @param url location
246 * @return A Set of hyperlinks
247 *
248 * @throws IOException
249 * @throws BadLocationException
250 * @throws NullPointerException
251 */
252 public Set getHyperLinks(URL url) throws IOException, BadLocationException, NullPointerException {
253 InputStreamReader stream = null;
254 HTMLDocument doc = null;
255 EditorKit kit = null;
256 Set links = new LinkedHashSet();
257
258 try {
259 stream = new InputStreamReader(url.openStream());
260 kit = new HTMLEditorKit();
261 doc = (HTMLDocument) kit.createDefaultDocument();
262 kit.read(stream, doc, 0);
263 }
264 finally {
265 stream.close();
266 }
267
268 for(HTMLDocument.Iterator i = doc.getIterator(HTML.Tag.A); i.isValid(); i.next()) {
269 SimpleAttributeSet s = (SimpleAttributeSet) i.getAttributes();
270 links.add((String) s.getAttribute(HTML.Attribute.HREF));
271 }
272 return links;
273 }
274
275 /***
276 * Finds links to files. This assumes that a file must end in a ".xxx", otherwise
277 * it will not be retrieved.
278 *
279 * @param url The location
280 * @param links The Set of files and directories found inside the URL
281 * @return a list of file URLs
282 * @throws MalformedURLException
283 */
284 public List getFileURLNames(URL url, Set links) throws MalformedURLException {
285 List fileURLs = new ArrayList();
286 String parent = url.toString();
287
288 if(parent.endsWith("/") == false)
289 parent = parent.concat("/");
290
291 for(Iterator i = links.iterator(); i.hasNext();) {
292 String link = i.next().toString();
293 if(isLinkFile(link)) {
294 if(effFileFilter.accept(new File(link)) == true)
295 fileURLs.add(new URL(parent.concat(link)));
296 }
297 }
298 return fileURLs;
299 }
300
301 /***
302 * Finds links to sub-directory URLs
303 *
304 * @param url The location
305 * @param links The Set of files and directories found inside the URL
306 * @return a list of sub directory URLs
307 * @throws MalformedURLException
308 */
309 public List getSubDirURLNames(URL url, Set links) throws MalformedURLException {
310 List dirURLs = new ArrayList();
311 String parent = url.toString();
312
313 if(parent.endsWith("/") == false)
314 parent = parent.concat("/");
315
316 for(Iterator i = links.iterator(); i.hasNext();) {
317 String link = i.next().toString();
318 if(isLinkSubDir(url, link)) {
319 if(noDirFilter == null)
320 dirURLs.add(new URL(parent.concat(link)));
321 else if( (noDirFilter != null) && (noDirFilter.accept(new File(link)) == true) )
322 dirURLs.add(new URL(parent.concat(link)));
323 }
324 }
325 return dirURLs;
326 }
327
328 /***
329 * Determines if a hyperlink is a file. The rule is that if the name ends with a ".xxx",
330 * then it is a file. Otherwise, false is returned.
331 *
332 * @param link The hyperlink name to examine
333 * @return 'true' if hyperlink contains a 3 character file extension, 'false' otherwise
334 */
335 public boolean isLinkFile(String link) {
336 String ext = FilenameUtils.getExtension(link);
337 if(ext.length() == fileExt) {
338 return true;
339 }
340 else
341 return false;
342
343 }
344
345 /***
346 * Determines if a hyperlink is a sub-directory.
347 *
348 * @param url The location
349 * @param link The hyperlink name to examine
350 * @return 'true' if hyperlink is a sub-directory, 'false' otherwise
351 */
352 public boolean isLinkSubDir(URL url, String link) {
353 if( !isLinkFile(link) && link.indexOf('#') == -1 && link.indexOf('?') == -1) {
354
355 String parent = new File(url.getFile()).getParent();
356 if( parent.equalsIgnoreCase(new File(link).toString()) )
357 return false;
358 else
359 return true;
360 }
361 else
362 return false;
363
364 }
365
366 }