1 @INPROCEEDINGS{chatnoir,
\r
2 AUTHOR = {Martin Potthast and Matthias Hagen and Benno Stein and Jan Gra{\ss}egger and Maximilian Michel and Martin Tippmann and Clement Welsch},
\r
3 BOOKTITLE = {35th International ACM Conference on Research and Development in Information Retrieval (SIGIR 12) (to appear)},
\r
5 EDITOR = {Bill Hersh and Jamie Callan and Yoelle Maarek and Mark Sanderson},
\r
10 SITE = {Portland, Oregon},
\r
11 TITLE = {{ChatNoir: A Search Engine for the ClueWeb09 Corpus}},
\r
15 @BOOK{text_patterns,
\r
16 author = "Scott, Mike and Tribble, Christopher",
\r
17 title = "{Textual Patterns, Key words and corpus analysis in language education}",
\r
19 publisher = "{John Benjamins Publishing Company}",
\r
25 title = "{Sketch Engine EnTenTen corpus}",
\r
26 howpublished = "\url{http://trac.sketchengine.co.uk/wiki/Corpora/enTenTen}",
\r
30 @INPROCEEDINGS{Knight,
\r
31 author = {Allan Knight and Kevin Almeroth and Bruce Bimber},
\r
32 title = {An Automated System for Plagiarism Detection Using the Internet},
\r
33 booktitle = {Proceedings of World Conference on Educational Multimedia, Hypermedia and Telecommunications, pg. 3619-3625},
\r
38 @INPROCEEDINGS{Kasprzak2008,
\r
39 AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c} and Pavel {\v S}merk",
\r
40 TITLE = "Distributed System for Discovering Similar Documents",
\r
41 SUBTITLE = "From a Relational Database to the Custom-Developed Parallel solution",
\r
42 BOOKTITLE = "ICEIS 2008: Proceedings of the Tenth International Conference on Enterprise Information Systems, Vol. DISI---Databases and Information Systems Integration",
\r
44 publisher = "INSTICC (Institute for Systems and Technologies of Information, Control and Communication), Setúbal, Portugal",
\r
46 isbn = "978-989-8111-36-4",
\r
50 @INPROCEEDINGS{Kasprzak2009,
\r
51 AUTHOR = "Jan Kasprzak and Michal Brandejs and Jitka Brandejsová",
\r
52 TITLE = "Distributed Aspects of the System for Discovering Similar Documents",
\r
53 BOOKTITLE = "ITA 09: Proceedings of the Third International Conference on Internet Technology and Applications",
\r
58 @INPROCEEDINGS{Kasprzak2009a,
\r
59 AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav Křipač",
\r
60 TITLE = "Finding Plagiarism by Evaluating Document Similarities",
\r
61 BOOKTITLE = "SEPLN'09: The 25th edition of the Annual Conference of the Spanish Society for Natural Language Processing",
\r
66 @INPROCEEDINGS{Monostori2002,
\r
67 author = {Kriszti\'{a}n Monostori and Raphael A. Finkel and Arkady B. Zaslavsky and G\'{a}bor Hod\'{a}sz and M\'{a}t\'{e} Pataki},
\r
68 title = {Comparison of Overlap Detection Techniques},
\r
69 booktitle = {ICCS '02: Proceedings of the International Conference on Computational Science-Part I},
\r
71 isbn = {3-540-43591-3},
\r
73 publisher = {Springer-Verlag},
\r
74 address = {London, UK},
\r
79 title = "{Czech National Archive of Graduate Theses}",
\r
80 howpublished = "\url{http://theses.cz/}",
\r
81 year = "2008--2011",
\r
86 title = "{Masaryk University Information System}",
\r
87 howpublished = "\url{http://is.muni.cz/}",
\r
88 year = "1999--2011",
\r
92 key = "{Odevzdej.CZ}",
\r
93 title = "{Odevzdej---the system for collecting seminar works}",
\r
94 howpublished = "\url{http://odevzdej.cz/}",
\r
95 year = "2009--2011",
\r
99 @inproceedings{finkel2002,
\r
100 author = {Finkel, Raphael A. and Zaslavsky, Arkady and Monostori, Kriszti\'{a}n and Schmidt, Heinz},
\r
101 title = {Signature extraction for overlap detection in documents},
\r
102 booktitle = {ACSC '02: Proceedings of the twenty-fifth Australasian conference on Computer science},
\r
104 isbn = {0-909925-82-8},
\r
106 location = {Melbourne, Victoria, Australia},
\r
107 publisher = {Australian Computer Society, Inc.},
\r
108 address = {Darlinghurst, Australia},
\r
111 @INPROCEEDINGS{broder97,
\r
112 title={On the resemblance and containment of documents},
\r
113 author={Broder, Andrei Z.},
\r
114 booktitle={Compression and Complexity of Sequences 1997. Proceedings},
\r
120 keywords={information retrieval, random processes, set theory, Rabin fingerprints, World Wide Web, containment, documents, fixed size sample, informal notions, information retrieval, intersection problems, mathematical notions, mathematical properties, random sampling, resemblance, roughly contained, roughly the same},
\r
121 doi={10.1109/SEQUEN.1997.666900},
\r
126 author="Rivest, Ronald L.",
\r
127 title="{RFC1321: The MD5 Message-Digest Algorithm}",
\r
129 publisher = {RFC Editor},
\r
130 address = {United States},
\r
131 note={\url{http://www.rfc-editor.org/rfc/rfc1321.txt}},
\r
134 @Misc{britannicaplagiarism,
\r
135 author = "{Encyclop{\ae}dia Britannica}",
\r
136 title = "Plagiarism",
\r
137 howpublished = "retrieved 2009--08--24 from \url{http://www.britannica.com/EBchecked/topic/462640/plagiarism}",
\r
142 author = "iDnes.CZ",
\r
143 title = "Zlínského děkana usvědčili z plagiátorství",
\r
144 howpublished = "retrieved 2009--08--25 from \url{http://zpravy.idnes.cz/studium.asp?c=A080709_085836_studium_bar}",
\r
148 @inproceedings{pomikalek2008,
\r
149 author = "Pomikálek, Jan and Rychlý, Pavel",
\r
150 title = "Detecting Co-Derivative Documents in Large Text Collections",
\r
151 booktitle = "Proceedings of the Sixth International Language Resources and Evaluation (LREC'08)",
\r
154 address = "Marrakech, Morocco",
\r
155 url = "http://www.lrec-conf.org/lrec2008/"
\r
158 @inproceedings{pomikalek2009,
\r
159 author = "Pomikálek, Jan and Rychlý, Pavel and Kilgarriff, Adam",
\r
160 title = "Scaling to Billion-plus Word Corpora",
\r
161 booktitle = "Advances in Computational Linguistics",
\r
164 address = "Mexico",
\r
165 issn = "1870-4069",
\r
166 publisher = "Instituto Politécnico Nacional",
\r
171 title = "Masaryk University: Full-text search",
\r
172 howpublished = "retrieved 2009--08--25 from \url{http://www.muni.cz/general/search}",
\r
177 author = "Alan Cox",
\r
178 title = "{Alan Cox talks about laws... and Linux}",
\r
179 howpublished = "retrieved 2009--08--27 from \url{http://interviews.slashdot.org/article.pl?sid=02/05/20/1314214}",
\r
183 @inproceedings{coderivative,
\r
184 Author = {Bernstein, Yaniv and Zobel, Justin},
\r
185 Title = {{A Scalable System for Identifying Co-derivative Documents}},
\r
186 Booktitle = {{String Processing and Information Retrieval, Proceedings}},
\r
187 Series = {{Lecture Notes in Computer Science}},
\r
191 Publisher = {{Springer-Verlag Berlin}},
\r
192 Type = {{Proceedings Paper}},
\r
193 Language = {{English}},
\r
194 Affiliation = {{Bernstein, Y (Reprint Author), RMIT Univ, Sch Comp Sci \& Informat Technol, Melbourne, Vic, Australia.
\r
195 RMIT Univ, Sch Comp Sci \& Informat Technol, Melbourne, Vic, Australia.}},
\r
196 ISSN = {{0302-9743}},
\r
197 ISBN = {{3-540-23210-9}},
\r
198 Keywords-Plus = {{COMPRESSION}},
\r
199 Subject-Category = {{Computer Science, Theory \& Methods}},
\r
200 Author-Email = {{ybernste@cs.rmit.edu.au
\r
201 jz@cs.rmit.edu.au}},
\r
202 Number-of-Cited-References = {{17}},
\r
203 Times-Cited = {{2}},
\r
204 Doc-Delivery-Number = {{BBA15}},
\r
205 Unique-ID = {{ISI:000224377200006}},
\r
209 key = "{Turnitin.com}",
\r
210 title = "Turnitin",
\r
211 howpublished = "\url{http://turnitin.com/}, retrieved 2009--08--26",
\r
216 key = "{Copyscape.com}",
\r
217 title = "Copyscape",
\r
218 howpublished = "\url{http://copyscape.com/}, retrieved 2009--08--26",
\r
223 key = "{Doccop.com}",
\r
225 howpublished = "\url{http://doccop.com/}, retrieved 2009--08--26",
\r
229 @MISC{pan09competition,
\r
231 title = "1st International Competition on Plagiarism Detection",
\r
232 howpublished = "\url{http://www.uni-weimar.de/medien/webis/research/workshopseries/pan-09/competition.html}, retrieved 2009--08--26",
\r
236 @INPROCEEDINGS{Brin95copydetection,
\r
237 author = {Sergey Brin and James Davis and Hector Garcia-Molina},
\r
238 title = {Copy Detection Mechanisms for Digital Documents},
\r
239 booktitle = {Proceedings of the ACM SIGMOD Annual Conference},
\r
244 @INPROCEEDINGS{Shivakumar95scam,
\r
245 author = {Narayanan Shivakumar and Hector Garcia-Molina},
\r
246 title = {SCAM: A Copy Detection Mechanism for Digital Documents},
\r
247 booktitle = {Proceedings of the Second Annual Conference on the Theory and Practice of Digital Libraries},
\r
251 @INPROCEEDINGS{Garcia-Molina96dscam:finding,
\r
252 author = {Hector Garcia-Molina and Luis Gravano and Narayanan Shivakumar},
\r
253 title = {dSCAM: Finding Document Copies across Multiple Databases},
\r
254 booktitle = {Proceedings of the 4th International Conference on Parallel and Distributed Information Systems},
\r
258 @INPROCEEDINGS{LinuxDesktop,
\r
259 author = {Jan Kasprzak},
\r
260 title = "{Desktop a jádro Linuxu}",
\r
261 booktitle = {Proceedings of the XXXI EurOpen.CZ Conference},
\r
262 isbn = "978-80-86583-13-6",
\r
264 publisher = "EurOpen.CZ, Plzeň",
\r
268 @INPROCEEDINGS{Filesystems,
\r
269 author = {Jan Kasprzak},
\r
270 title = "{Co umí souborové systémy}",
\r
271 booktitle = {Proceedings of the XXXII EurOpen.CZ Conference},
\r
272 isbn = "978-80-86583-14-3",
\r
273 pages = {105--118},
\r
274 publisher = "EurOpen.CZ, Plzeň",
\r
278 @INPROCEEDINGS{GitEuropen,
\r
279 author = {Jan Kasprzak},
\r
280 title = "{Git aneb správa verzí trochu jinak}",
\r
281 booktitle = {Proceedings of the XXXIV EurOpen.CZ Conference},
\r
282 isbn = "978-80-86583-16-7",
\r
283 pages = {107--118},
\r
284 publisher = "EurOpen.CZ, Plzeň",
\r
288 @INPROCEEDINGS{Clusters,
\r
289 author = {Jan Kasprzak},
\r
290 title = "{Clusterová řešení pod Linuxem}",
\r
291 booktitle = {SLT 2001: Proceedings of the 2nd Seminar on Linux and \TeX},
\r
292 isbn = "80-7302-009-2",
\r
293 pages = {161--168},
\r
294 publisher = "Konvoj, Brno",
\r
299 AUTHOR = {{Webis at Bauhaus-Universität Weimar} and
\r
300 {NLEL at Universidad Politécnica de Valencia}},
\r
301 HOWPUBLISHED = {\url{http://www.webis.de/research/corpora}},
\r
302 TITLE = {{PAN Plagiarism Corpus 2009 (PAN-PC-09)}},
\r
304 NOTE = {{Martin Potthast, Andreas Eiselt, Benno Stein,
\r
305 Alberto Barrón-Cedeño, and Paolo Rosso (editors)}}
\r
308 @INPROCEEDINGS{ngram,
\r
309 author = {William B. Cavnar and John M. Trenkle},
\r
310 title = {N-Gram-Based Text Categorization},
\r
311 booktitle = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document Analysis and Information Retrieval},
\r
316 @inproceedings{intrinsic,
\r
317 author = {Sven Meyer zu Eissen and Benno Stein},
\r
318 booktitle = {ECIR},
\r
319 editor = {Mounia Lalmas and Andy MacFarlane and Stefan M. Rüger and Anastasios Tombros and Theodora Tsikrika and Alexei Yavlinsky},
\r
321 publisher = {Springer},
\r
322 series = {Lecture Notes in Computer Science},
\r
323 title = {Intrinsic Plagiarism Detection},
\r
324 url = {http://dblp.uni-trier.de/db/conf/ecir/ecir2006.html#EissenS06},
\r
327 ee = {http://dx.doi.org/10.1007/11735106_66},
\r
328 isbn = {3-540-33347-9},
\r
329 date = {2006-04-03}
\r
332 @INPROCEEDINGS{Heintze96scalabledocument,
\r
333 author = {Nevin Heintze},
\r
334 title = {Scalable Document Fingerprinting},
\r
335 booktitle = {Proc. USENIX Workshop on Electronic Commerce},
\r
339 @inproceedings{suffixtree,
\r
340 author = {Manber, Udi and Myers, Gene},
\r
341 title = {Suffix arrays: a new method for on-line string searches},
\r
342 booktitle = {SODA '90: Proceedings of the first annual ACM-SIAM symposium on Discrete algorithms},
\r
344 isbn = {0-89871-251-3},
\r
345 pages = {319--327},
\r
346 location = {San Francisco, California, United States},
\r
347 publisher = {Society for Industrial and Applied Mathematics},
\r
348 address = {Philadelphia, PA, USA}
\r
354 AUTHOR = "Andrew Tridgell and Paul Mackerras",
\r
355 TITLE = "The rsync algorithm",
\r
356 DATE = "2004--05--19",
\r
358 INSTITUTION = "Department of Computer Science, FEIT, Australian National University",
\r
360 NOTE = "\url{http://hdl.handle.net/1885/40765}",
\r
365 AUTHOR = "Linus Torvalds and others",
\r
366 TITLE = "{Git---the Fast Version Control System}",
\r
367 HOWPUBLISHED = "\url{http://git-scm.com/}, retrieved 2011--01--12",
\r
372 TITLE = "{The RPM Package Manager}",
\r
373 HOWPUBLISHED = "\url{http://www.rpm.org/}, retrieved 2011--01--12",
\r
378 TITLE = "{DeltaRPM}",
\r
379 HOWPUBLISHED = "\url{ftp://ftp.suse.com/pub/projects/deltarpm/}, retrieved 2011--01--12",
\r
382 @mastersthesis{zazrivec,
\r
383 AUTHOR = "Milan Zázrivec",
\r
384 TITLE = "Algoritmus a implementace software pro tvorbu binárních záplat",
\r
385 SCHOOL = "Faculty of Informatics, Masaryk University",
\r
386 NOTE = "\url{http://is.muni.cz/th/60716/fi_m/}",
\r
391 KEY = "Google Scholar",
\r
392 TITLE = "{Google Scholar}",
\r
393 HOWPUBLISHED = "\url{http://scholar.google.com/}, retrieved 2011--01--12",
\r
398 TITLE = "{American National Standard for Information Systems -- Coded Character Sets -- 7-Bit American National Standard Code for Information Interchange (7-Bit ASCII), ANSI X3.4-1986}",
\r
399 DATE="1986--03--26",
\r
401 INSTITUTION = "American National Standards Institute, Inc.",
\r
405 AUTHOR = "{The Unicode Consortium}",
\r
406 TITLE = "{The Unicode Standard -- Version 4.0}",
\r
407 PUBLISHER = "Addison-Wesley, Boston, MA",
\r
409 ISBN = "0-321-18578-1",
\r
410 NOTE = "\url{http://www.unicode.org/unicode/standard/versions/enumeratedversions.html#Unicode_4_0_0}",
\r
415 TITLE = "{RFC 3629: UTF-8, a transformation format of ISO 10646}",
\r
416 AUTHOR = "F. Yergeau",
\r
418 NOTE = "\url{http://tools.ietf.org/html/rfc3629}",
\r
421 @MISC{thesisproposal,
\r
422 TITLE = "{Systems for Discovering Similar Documents}",
\r
423 AUTHOR = "Jan Kasprzak",
\r
424 INSTITUTION = "Faculty of Informatics, Masaryk University",
\r
426 NOTE = "Ph.D. thesis proposal, \url{http://is.muni.cz/th/1885/fi_r/}",
\r
429 @inproceedings{Kasprzak2010,
\r
430 title={Improving the reliability of the plagiarism detection system},
\r
431 author={Kasprzak, J. and Brandejs, M.},
\r
432 booktitle={Notebook Papers of CLEF 2010 LABs and Workshops},
\r
434 organization={Citeseer}
\r
437 @article{stamatatos2011plagiarism,
\r
438 title={Plagiarism detection using stopword n-grams},
\r
439 author={Stamatatos, E.},
\r
440 journal={Journal of the American Society for Information Science and Technology},
\r
442 publisher={Wiley Online Library}
\r
444 @inproceedings{pan09stamatatos,
\r
445 author = {Efstathios Stamatatos},
\r
447 title = {Intrinsic Plagiarism Detection Using Character n-gram Profiles},
\r
448 booktitle = {Proceedings of the SEPLN'09 Workshop on Uncovering Plagiarism, Authorship and Social Software Misuse},
\r
450 location = {San Sebastian (Donostia), Spain},
\r
451 issn = {1613-0073},
\r
455 @book{zipf1935psycho,
\r
456 title={The psycho-biology of language},
\r
457 author={Zipf, G.K.},
\r
459 publisher={Houghton, Mifflin}
\r
462 @INPROCEEDINGS{potthastframework,
\r
463 TITLE = {{An Evaluation Framework for Plagiarism Detection}
\r
465 AUTHOR = {Martin Potthast and Benno Stein and Alberto Barr{\'o}n-Cede{\~n}o and Paolo Rosso},
\r
466 BOOKTITLE = {Proceedings of the 23rd International Conference on Computational Linguistics (COLING 2010) (to appear)},
\r
469 ADDRESS = {Beijing, China},
\r
470 PUBLISHER = {Association for Computational Linguistics},
\r