View Javadoc

1   //Copyright 2006-2007, by the California Institute of Technology.
2   //ALL RIGHTS RESERVED. United States Government Sponsorship acknowledged.
3   //Any commercial use must be negotiated with the Office of Technology Transfer
4   //at the California Institute of Technology.
5   //
6   //This software is subject to U. S. export control laws and regulations
7   //(22 C.F.R. 120-130 and 15 C.F.R. 730-774). To the extent that the software
8   //is subject to U.S. export control laws and regulations, the recipient has
9   //the responsibility to obtain export licenses or other export authority as
10  //may be required before exporting such information to foreign countries or
11  //providing access to foreign nationals.
12  
13  // $Id: FileListGenerator.java 3461 2008-08-07 17:43:26Z pramirez $ 
14  
15  package gov.nasa.pds.tools.file;
16  
17  import java.io.File;
18  import java.io.FileFilter;
19  import java.io.IOException;
20  import java.io.InputStreamReader;
21  import java.net.MalformedURLException;
22  import java.net.URL;
23  import java.util.Arrays;
24  import java.util.Iterator;
25  import java.util.LinkedHashSet;
26  import java.util.List;
27  import java.util.ArrayList;
28  import java.util.Set;
29  
30  import javax.swing.text.BadLocationException;
31  import javax.swing.text.EditorKit;
32  import javax.swing.text.SimpleAttributeSet;
33  import javax.swing.text.html.HTML;
34  import javax.swing.text.html.HTMLDocument;
35  import javax.swing.text.html.HTMLEditorKit;
36  
37  import org.apache.commons.io.filefilter.FileFilterUtils;
38  import org.apache.commons.io.filefilter.IOFileFilter;
39  import org.apache.commons.io.filefilter.NotFileFilter;
40  import org.apache.commons.io.FileUtils;
41  import org.apache.commons.io.FilenameUtils;
42  
43  import gov.nasa.pds.tools.file.filefilter.WildcardOSFilter;
44  
45  /***
46   * Class that can generate a list of files from a supplied directory and optionally, a specified
47   * filter. The resulting files and directories are stored in a FileList.
48   * 
49   * @author mcayanan
50   * @version $Revision $
51   */
52  public class FileListGenerator {
53  
54  	private NotFileFilter noFileFilter;
55  	private IOFileFilter noDirFilter;
56  	private IOFileFilter fileFilter;
57  	private IOFileFilter effFileFilter;
58  	private FileFilter effDirFilter;
59  	private final int fileExt = 3;
60  	
61  	/***
62  	 * Default constructor
63  	 *
64  	 */
65  	public FileListGenerator() {
66  		fileFilter = new WildcardOSFilter("*");
67  		noFileFilter = null;
68  		noDirFilter = null;
69  	}
70  	
71  	/***
72  	 * Sets the filter to be used when searching for files in a directory
73  	 * @param wildcards a list of files and/or file patterns to match
74  	 */
75  	private void setFileFilter(List wildcards) {
76  		fileFilter = new WildcardOSFilter(wildcards);
77  	}
78  	
79  	/***
80  	 * Sets the filter to be used when searching for files to ignore in a directory
81  	 * @param wildcards a list of files and/or file patterns to ignore
82  	 */
83  	private void setNoFileFilter(List wildcards) {
84  		noFileFilter = new NotFileFilter(new WildcardOSFilter(wildcards));
85  	}
86  	
87  	/***
88  	 * Sets the filter to be used when searching for directories to ignore
89  	 * @param patterns a list of directory/directory patterns to ignore
90  	 */
91  	private void setNoDirFilter(List patterns) {
92  		noDirFilter = new NotFileFilter(new WildcardOSFilter(patterns));
93  	}
94  	
95  	/***
96  	 * Combines filters to include and exclude files using the AND file filter
97  	 *
98  	 */
99  	private void setEffFileFilter() {
100 		if(noFileFilter != null)
101 			effFileFilter = FileFilterUtils.andFileFilter(fileFilter, noFileFilter);
102 		else
103 			effFileFilter = fileFilter;
104 	}
105 	
106 	/***
107 	 * Combines filters to seek out directories to exclude using the AND file filter
108 	 *
109 	 */
110 	private void setEffDirFilter() {
111 		if(noDirFilter != null)
112 			effDirFilter = FileFilterUtils.andFileFilter(noDirFilter, FileFilterUtils.directoryFileFilter());
113 		else
114 			effDirFilter = FileFilterUtils.directoryFileFilter();
115 	}
116 	
117 	/***
118 	 * Sets all possible filters when looking in a directory.
119 	 * 
120 	 * @param regexp File patterns to include when finding files in a directory
121 	 * @param noFiles File patterns to ignore when finding files in a directory
122 	 * @param noDirs Directory patterns to ignore when finding sub-directories
123 	 */
124 	public void setFilters(List regexp, List noFiles, List noDirs) {
125 		if(regexp != null)
126 			setFileFilter(regexp);
127 		if(noFiles != null)
128 			setNoFileFilter(noFiles);
129 		if(noDirs != null)
130 			setNoDirFilter(noDirs);
131 		
132 		setEffFileFilter();
133 		setEffDirFilter();
134 	}
135 	
136 	/***
137 	 * Allows one to pass in a file or URL. Directories will be visited if the
138 	 * target is a directory. The resulting list is stored in a FileList object.
139 	 * 
140 	 * @param getSubDirs 'true' to look for sub-directories, 'false' to just search for files when
141 	 *                   given a directory as input
142 	 * @return A FileList object that contains the files and sub-directories
143 	 * @throws BadLocationException 
144 	 * @throws IOException 
145 	 */
146 	public FileList visitTarget(String target, boolean getSubDirs) throws IOException, BadLocationException {
147 		File file = null;
148 		FileList fileList = new FileList();
149 		
150 		try {
151 			URL url = new URL(target.toString());
152 			if((file = FileUtils.toFile(url)) != null) {
153 				return (visitFileTarget(file, getSubDirs));			
154 			}
155 			if(!isLinkFile(target.toString()))
156 				fileList = crawl(new URL(target.toString()), getSubDirs);
157 			else
158 				fileList.addToFileList(url);
159 		}
160 		catch(MalformedURLException uEx) {
161 			return (visitFileTarget(new File(target), getSubDirs));
162 		}
163 		return fileList;
164 	}
165 	
166 	/***
167 	 * Visits the file being supplied. If a directory is being passed in, then it
168 	 * will look for files and sub-directories (if it is turned ON). Otherwise, it
169 	 * simply gets stored in the FileList. 
170 	 * 
171 	 * @param file A file or a directory. If it is a directory, then the visitDir method
172 	 *      will be called and the list of files can be retrieved via the getFiles and
173 	 *      getSubDirs methods.
174 	 * @param getSubDirs Tells the method whether to look for sub-directories
175 	 * @return a FileList object
176 	 * @throws IOException
177 	 */
178 	private FileList visitFileTarget(File file, boolean getSubDirs) throws IOException {
179 		FileList fileList = new FileList();		
180 		if(file.isDirectory())
181 			fileList = visitDir(file, getSubDirs);
182 		else
183 			fileList.addToFileList(file);
184 		
185 		return fileList;
186 	}
187 	
188 	/***
189 	 * Gets a list of files under a given directory.
190 	 * 
191 	 * Filters must be set via setFileFilters prior to calling this method in order to look
192 	 * for specific files and filter out un-wanted files and sub-drirectories.
193 	 * 
194 	 * @param dir the name of the directory
195 	 * @param getSubDirs 'true' to get a list of sub-directories
196 	 * @return A FileList object containing the files and sub-directories found
197 	 * @throws IOException 
198 	 */
199 	public FileList visitDir(File dir, boolean getSubDirs) throws IOException {
200 		FileList fileList = new FileList();
201 		
202 		if( !dir.isDirectory() )
203 			throw new IllegalArgumentException("parameter 'dir' is not a directory: " + dir);
204 		
205 		//Find files only first
206 		fileList.addToFileList(FileUtils.listFiles(dir, effFileFilter, null));
207 		
208 		//Visit sub-directories if the recurse flag is set
209 		if(getSubDirs)
210 			fileList.addToDirList(Arrays.asList(dir.listFiles(effDirFilter)));
211 	
212 		return fileList;
213 	}
214 		
215 	/***
216 	 * Crawls a directory URL, looking for files and sub-directories. Files found in a URL
217 	 * are assumed to end with a ".xxx".
218 	 * 
219 	 * Filters must be set via the setFileFilters method prior to crawling in order to look for
220 	 * files and filter out un-wanted files and directories.
221 	 * 
222 	 * @param url The URL to crawl
223 	 * @param getSubDirURLs Set to 'true' to retrieve sub-directory URLs, 'false' otherwise
224 	 * @return A FileList object containing the files and sub-directories that were found.
225 	 * @throws IOException
226 	 * @throws BadLocationException
227 	 */
228 	public FileList crawl(URL url, boolean getSubDirURLs) throws IOException, BadLocationException {
229 		Set links = new LinkedHashSet();
230 		FileList fileList = new FileList();
231 		
232 		links.addAll(getHyperLinks(url));
233 		fileList.addToFileList(getFileURLNames(url, links));
234 		
235 		if(getSubDirURLs) {
236 			fileList.addToDirList(getSubDirURLNames(url, links));
237 		}
238 		
239 		return fileList;
240 	}
241 	
242 	/***
243 	 * Gets hyperlinks found in an HTML document of a URL. No duplicate links will be returned.
244 	 * 
245 	 * @param url location
246 	 * @return A Set of hyperlinks
247 	 * 
248 	 * @throws IOException
249 	 * @throws BadLocationException 
250 	 * @throws NullPointerException
251 	 */
252 	public Set getHyperLinks(URL url) throws IOException, BadLocationException, NullPointerException {
253 		InputStreamReader stream = null;
254 		HTMLDocument doc = null;
255 		EditorKit kit = null;
256 		Set links = new LinkedHashSet();
257 		
258 		try {
259 			stream = new InputStreamReader(url.openStream());
260 			kit = new HTMLEditorKit();
261 			doc = (HTMLDocument) kit.createDefaultDocument();
262 			kit.read(stream, doc, 0);
263 		}
264 		finally {
265 			stream.close();
266 		}
267 
268 		for(HTMLDocument.Iterator i = doc.getIterator(HTML.Tag.A); i.isValid(); i.next()) {
269 			SimpleAttributeSet s = (SimpleAttributeSet) i.getAttributes();
270 			links.add((String) s.getAttribute(HTML.Attribute.HREF));
271 		}
272 		return links;
273 	}
274 	
275 	/***
276 	 * Finds links to files. This assumes that a file must end in a ".xxx", otherwise
277 	 * it will not be retrieved.
278 	 * 
279 	 * @param url The location
280 	 * @param links The Set of files and directories found inside the URL
281 	 * @return a list of file URLs
282 	 * @throws MalformedURLException
283 	 */
284 	public List getFileURLNames(URL url, Set links) throws MalformedURLException {
285 		List fileURLs = new ArrayList();
286 		String parent = url.toString();
287 		
288 		if(parent.endsWith("/") == false)
289 			parent = parent.concat("/");
290 		
291 		for(Iterator i = links.iterator(); i.hasNext();) {
292 			String link = i.next().toString();
293 			if(isLinkFile(link)) {
294 				if(effFileFilter.accept(new File(link)) == true)
295 					fileURLs.add(new URL(parent.concat(link)));
296 			}
297 		}
298 		return fileURLs;
299 	}
300 	
301 	/***
302 	 * Finds links to sub-directory URLs
303 	 * 
304 	 * @param url The location
305 	 * @param links The Set of files and directories found inside the URL
306 	 * @return a list of sub directory URLs
307 	 * @throws MalformedURLException
308 	 */
309 	public List getSubDirURLNames(URL url, Set links) throws MalformedURLException {
310 		List dirURLs = new ArrayList();
311 		String parent = url.toString();
312 		
313 		if(parent.endsWith("/") == false)
314 			parent = parent.concat("/");
315 		
316 		for(Iterator i = links.iterator(); i.hasNext();) {
317 			String link = i.next().toString();
318 			if(isLinkSubDir(url, link)) {
319 				if(noDirFilter == null)
320 					dirURLs.add(new URL(parent.concat(link)));
321 				else if( (noDirFilter != null) && (noDirFilter.accept(new File(link)) == true) )
322 					dirURLs.add(new URL(parent.concat(link)));
323 			}
324 		}
325 		return dirURLs;
326 	}
327 	
328 	/***
329 	 * Determines if a hyperlink is a file. The rule is that if the name ends with a ".xxx",
330 	 * then it is a file. Otherwise, false is returned.
331 	 * 
332 	 * @param link The hyperlink name to examine
333 	 * @return 'true' if hyperlink contains a 3 character file extension, 'false' otherwise
334 	 */
335 	public boolean isLinkFile(String link) {
336 		String ext = FilenameUtils.getExtension(link);
337 		if(ext.length() == fileExt) {
338 			return true;
339 		}
340 		else
341 			return false;
342 			
343 	}
344 	
345 	/***
346 	 * Determines if a hyperlink is a sub-directory.
347 	 * 
348 	 * @param url The location
349 	 * @param link The hyperlink name to examine
350 	 * @return 'true' if hyperlink is a sub-directory, 'false' otherwise
351 	 */
352 	public boolean isLinkSubDir(URL url, String link) {
353 		if( !isLinkFile(link) && link.indexOf('#') == -1 && link.indexOf('?') == -1) {
354 			//Check to see if the directory link is a hyperlink to the parent
355 			String parent = new File(url.getFile()).getParent();
356 			if( parent.equalsIgnoreCase(new File(link).toString()) )
357 				return false;
358 			else
359 				return true;
360 		}
361 		else
362 			return false;
363 			
364 	}
365 	
366 }