% Bibliography database, written for classic BibTeX: author-less entries carry
% a `key` field for sorting/labels, and URLs are given as \url{...} inside
% `howpublished` or `note`.
%
% Conventions used below:
%   * one field per line, values delimited by braces;
%   * page ranges use an en-dash (`--`);
%   * accented letters in author/editor names use the {\'a} form so that
%     BibTeX treats each one as a single letter when sorting and labelling;
%   * corporate authors and compound surnames are wrapped in an extra pair of
%     braces so they are not split into first/last name parts.

@inproceedings{chatnoir,
  author    = {Martin Potthast and Matthias Hagen and Benno Stein and Jan Gra{\ss}egger and Maximilian Michel and Martin Tippmann and Clement Welsch},
  title     = {{ChatNoir: A Search Engine for the ClueWeb09 Corpus}},
  booktitle = {35th International ACM Conference on Research and Development in Information Retrieval (SIGIR 12)},
  editor    = {Bill Hersh and Jamie Callan and Yoelle Maarek and Mark Sanderson},
  month     = aug,
  year      = {2012},
  site      = {Portland, Oregon},
}

@book{text_patterns,
  author    = {Scott, Mike and Tribble, Christopher},
  title     = {{Textual Patterns, Key Words and Corpus Analysis in Language Education}},
  publisher = {John Benjamins Publishing Company},
  year      = {2006},
  pages     = {55--72},
}

@misc{ententen,
  key          = {{Corpus}},
  title        = {{Sketch Engine EnTenTen Corpus}},
  howpublished = {\url{http://trac.sketchengine.co.uk/wiki/Corpora/enTenTen}},
  year         = {2012},
}

@book{Introduction_to_information_retrieval,
  author    = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},
  title     = {Introduction to Information Retrieval},
  publisher = {Cambridge University Press},
  address   = {Cambridge, UK},
  year      = 2008,
  pages     = {118--120},
  isbn      = {978-0-521-86571-5},
  abstract  = {Class-tested and coherent, this textbook teaches classical and web information retrieval, including web search and the related areas of text classification and text clustering from basic concepts. It gives an up-to-date treatment of all aspects of the design and implementation of systems for gathering, indexing, and searching documents; methods for evaluating systems; and an introduction to the use of machine learning methods on text collections. All the important ideas are explained using examples and figures, making it perfect for introductory courses in information retrieval for advanced undergraduates and graduate students in computer science. Based on feedback from extensive classroom experience, the book has been carefully structured in order to make teaching more natural and effective.},
  keywords  = {v1205 book ai information retrieval language processing search xml web},
  file      = {Cambridge University Press Product Page:http\://www.cambridge.org/9780521865715:URL;Amazon Search inside:http\://www.amazon.de/gp/reader/0521865719/:URL;Google Books:http\://books.google.de/books?isbn=978-0-521-86571-5:URL},
  biburl    = {http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63},
  added-at  = {2012-05-30T10:50:27.000+0200},
  timestamp = {2012-05-30T10:50:27.000+0200},
  groups    = {public},
  interhash = {b6954037b1d444f4afe4cad883b4d80c},
  intrahash = {8516d94c1f7aa1e391ddd3ace4caa23b},
  username  = {flint63},
}

@inproceedings{Knight,
  author    = {Allan Knight and Kevin Almeroth and Bruce Bimber},
  title     = {An Automated System for Plagiarism Detection Using the Internet},
  booktitle = {Proceedings of World Conference on Educational Multimedia, Hypermedia and Telecommunications},
  pages     = {3619--3625},
  year      = {2004},
}

% NOTE: the entries `awfc` and `intrinsic` (further below) describe the same
% paper; both keys are kept so that existing citations keep resolving.
@inproceedings{awfc,
  author    = {{Meyer zu Eissen}, Sven and Benno Stein},
  title     = {Intrinsic Plagiarism Detection},
  booktitle = {Proceedings of the European Conference on Information Retrieval (ECIR-06)},
  year      = {2006},
}

@inproceedings{Kasprzak2008,
  author    = {Jan Kasprzak and Michal Brandejs and Miroslav K{\v{r}}ipa{\v{c}} and Pavel {\v S}merk},
  title     = {Distributed System for Discovering Similar Documents},
  subtitle  = {From a Relational Database to the Custom-Developed Parallel solution},
  booktitle = {ICEIS 2008: Proceedings of the Tenth International Conference on Enterprise Information Systems, Vol. DISI---Databases and Information Systems Integration},
  year      = {2008},
  publisher = {INSTICC (Institute for Systems and Technologies of Information, Control and Communication), Setúbal, Portugal},
  pages     = {437--440},
  isbn      = {978-989-8111-36-4},
  file      = {F},
}

@inproceedings{Kasprzak2009,
  author    = {Jan Kasprzak and Michal Brandejs and Jitka Brandejsov{\'a}},
  title     = {Distributed Aspects of the System for Discovering Similar Documents},
  booktitle = {ITA 09: Proceedings of the Third International Conference on Internet Technology and Applications},
  year      = {2009},
  file      = {F},
}

@inproceedings{Kasprzak2009a,
  author    = {Jan Kasprzak and Michal Brandejs and Miroslav K{\v{r}}ipa{\v{c}}},
  title     = {Finding Plagiarism by Evaluating Document Similarities},
  booktitle = {SEPLN'09: The 25th edition of the Annual Conference of the Spanish Society for Natural Language Processing},
  year      = {2009},
  file      = {F},
}

@inproceedings{Monostori2002,
  author    = {Kriszti{\'a}n Monostori and Raphael A. Finkel and Arkady B. Zaslavsky and G{\'a}bor Hod{\'a}sz and M{\'a}t{\'e} Pataki},
  title     = {Comparison of Overlap Detection Techniques},
  booktitle = {ICCS '02: Proceedings of the International Conference on Computational Science-Part I},
  year      = {2002},
  isbn      = {3-540-43591-3},
  pages     = {51--60},
  publisher = {Springer-Verlag},
  address   = {London, UK},
}

@misc{theses.cz,
  key          = {Theses.CZ},
  title        = {{Czech National Archive of Graduate Theses}},
  howpublished = {\url{http://theses.cz/}},
  year         = {2008--2011},
}

@misc{ismu,
  key          = {{IS MU}},
  title        = {{Masaryk University Information System}},
  howpublished = {\url{http://is.muni.cz/}},
  year         = {1999--2011},
}

@misc{odevzdej.cz,
  key          = {{Odevzdej.CZ}},
  title        = {{Odevzdej---the system for collecting seminar works}},
  howpublished = {\url{http://odevzdej.cz/}},
  year         = {2009--2011},
}

@inproceedings{finkel2002,
  author    = {Finkel, Raphael A. and Zaslavsky, Arkady and Monostori, Kriszti{\'a}n and Schmidt, Heinz},
  title     = {Signature extraction for overlap detection in documents},
  booktitle = {ACSC '02: Proceedings of the twenty-fifth Australasian conference on Computer science},
  year      = {2002},
  isbn      = {0-909925-82-8},
  pages     = {59--64},
  location  = {Melbourne, Victoria, Australia},
  publisher = {Australian Computer Society, Inc.},
  address   = {Darlinghurst, Australia},
}

@inproceedings{broder97,
  author    = {Broder, Andrei Z.},
  title     = {On the resemblance and containment of documents},
  booktitle = {Compression and Complexity of Sequences 1997. Proceedings},
  year      = {1997},
  month     = jun,
  pages     = {21--29},
  keywords  = {information retrieval, random processes, set theory, Rabin fingerprints, World Wide Web, containment, documents, fixed size sample, informal notions, information retrieval, intersection problems, mathematical notions, mathematical properties, random sampling, resemblance, roughly contained, roughly the same},
  doi       = {10.1109/SEQUEN.1997.666900},
}

% An RFC is not a journal article (there is no `journal`), so this uses the
% misc type, like the `rfc3629` entry further below.
@misc{RFC1321,
  author    = {Rivest, Ronald L.},
  title     = {{RFC1321: The MD5 Message-Digest Algorithm}},
  year      = {1992},
  publisher = {RFC Editor},
  address   = {United States},
  note      = {\url{http://www.rfc-editor.org/rfc/rfc1321.txt}},
}

@misc{britannicaplagiarism,
  author       = {{Encyclop{\ae}dia Britannica}},
  title        = {Plagiarism},
  howpublished = {retrieved 2009--08--24 from \url{http://www.britannica.com/EBchecked/topic/462640/plagiarism}},
  year         = 2009,
}

@misc{idneszlin,
  author       = {iDnes.CZ},
  title        = {Zlínského děkana usvědčili z plagiátorství},
  howpublished = {retrieved 2009--08--25 from \url{http://zpravy.idnes.cz/studium.asp?c=A080709_085836_studium_bar}},
  year         = 2008,
}

@inproceedings{pomikalek2008,
  author    = {Pomik{\'a}lek, Jan and Rychl{\'y}, Pavel},
  title     = {Detecting Co-Derivative Documents in Large Text Collections},
  booktitle = {Proceedings of the Sixth International Language Resources and Evaluation (LREC'08)},
  year      = {2008},
  pages     = {132--135},
  address   = {Marrakech, Morocco},
  url       = {http://www.lrec-conf.org/lrec2008/},
}

@inproceedings{pomikalek2009,
  author    = {Pomik{\'a}lek, Jan and Rychl{\'y}, Pavel and Kilgarriff, Adam},
  title     = {Scaling to Billion-plus Word Corpora},
  booktitle = {Advances in Computational Linguistics},
  year      = {2009},
  pages     = {3--13},
  address   = {Mexico},
  issn      = {1870-4069},
  publisher = {Instituto Politécnico Nacional},
}

@misc{munisearch,
  key          = {MU},
  title        = {Masaryk University: Full-text search},
  howpublished = {retrieved 2009--08--25 from \url{http://www.muni.cz/general/search}},
  year         = 2009,
}

@misc{coxslashdot,
  author       = {Alan Cox},
  title        = {{Alan Cox talks about laws... and Linux}},
  howpublished = {retrieved 2009--08--27 from \url{http://interviews.slashdot.org/article.pl?sid=02/05/20/1314214}},
  year         = 2002,
}

@inproceedings{coderivative,
  author                     = {Bernstein, Yaniv and Zobel, Justin},
  title                      = {{A Scalable System for Identifying Co-derivative Documents}},
  booktitle                  = {String Processing and Information Retrieval, Proceedings},
  series                     = {Lecture Notes in Computer Science},
  year                       = {2004},
  volume                     = {3246},
  pages                      = {55--67},
  publisher                  = {Springer-Verlag Berlin},
  type                       = {Proceedings Paper},
  language                   = {English},
  affiliation                = {Bernstein, Y (Reprint Author), RMIT Univ, Sch Comp Sci \& Informat Technol, Melbourne, Vic, Australia. RMIT Univ, Sch Comp Sci \& Informat Technol, Melbourne, Vic, Australia.},
  issn                       = {0302-9743},
  isbn                       = {3-540-23210-9},
  keywords-plus              = {COMPRESSION},
  subject-category           = {Computer Science, Theory \& Methods},
  author-email               = {ybernste@cs.rmit.edu.au jz@cs.rmit.edu.au},
  number-of-cited-references = {17},
  times-cited                = {2},
  doc-delivery-number        = {BBA15},
  unique-id                  = {ISI:000224377200006},
}

@misc{turnitin,
  key          = {{Turnitin.com}},
  title        = {Turnitin},
  howpublished = {\url{http://turnitin.com/}, retrieved 2009--08--26},
  year         = {2009},
}

@misc{copyscape,
  key          = {{Copyscape.com}},
  title        = {Copyscape},
  howpublished = {\url{http://copyscape.com/}, retrieved 2009--08--26},
  year         = {2009},
}

@misc{doccop,
  key          = {{Doccop.com}},
  title        = {DocCop},
  howpublished = {\url{http://doccop.com/}, retrieved 2009--08--26},
  year         = {2009},
}

@misc{pan09competition,
  key          = {{PAN'09}},
  title        = {1st International Competition on Plagiarism Detection},
  howpublished = {\url{http://www.uni-weimar.de/medien/webis/research/workshopseries/pan-09/competition.html}, retrieved 2009--08--26},
  year         = {2009},
}

@inproceedings{Brin95copydetection,
  author    = {Sergey Brin and James Davis and Hector Garcia-Molina},
  title     = {Copy Detection Mechanisms for Digital Documents},
  booktitle = {Proceedings of the ACM SIGMOD Annual Conference},
  year      = {1995},
  pages     = {398--409},
}

@inproceedings{Shivakumar95scam,
  author    = {Narayanan Shivakumar and Hector Garcia-Molina},
  title     = {SCAM: A Copy Detection Mechanism for Digital Documents},
  booktitle = {Proceedings of the Second Annual Conference on the Theory and Practice of Digital Libraries},
  year      = {1995},
}

@inproceedings{Garcia-Molina96dscam:finding,
  author    = {Hector Garcia-Molina and Luis Gravano and Narayanan Shivakumar},
  title     = {dSCAM: Finding Document Copies across Multiple Databases},
  booktitle = {Proceedings of the 4th International Conference on Parallel and Distributed Information Systems},
  year      = {1996},
}

@inproceedings{LinuxDesktop,
  author    = {Jan Kasprzak},
  title     = {{Desktop a jádro Linuxu}},
  booktitle = {Proceedings of the XXXI EurOpen.CZ Conference},
  isbn      = {978-80-86583-13-6},
  pages     = {45--60},
  publisher = {EurOpen.CZ, Plzeň},
  year      = {2007},
}

@inproceedings{Filesystems,
  author    = {Jan Kasprzak},
  title     = {{Co umí souborové systémy}},
  booktitle = {Proceedings of the XXXII EurOpen.CZ Conference},
  isbn      = {978-80-86583-14-3},
  pages     = {105--118},
  publisher = {EurOpen.CZ, Plzeň},
  year      = {2008},
}

@inproceedings{GitEuropen,
  author    = {Jan Kasprzak},
  title     = {{Git aneb správa verzí trochu jinak}},
  booktitle = {Proceedings of the XXXIV EurOpen.CZ Conference},
  isbn      = {978-80-86583-16-7},
  pages     = {107--118},
  publisher = {EurOpen.CZ, Plzeň},
  year      = {2009},
}

@inproceedings{Clusters,
  author    = {Jan Kasprzak},
  title     = {{Clusterová řešení pod Linuxem}},
  booktitle = {SLT 2001: Proceedings of the 2nd Seminar on Linux and \TeX},
  isbn      = {80-7302-009-2},
  pages     = {161--168},
  publisher = {Konvoj, Brno},
  year      = {2001},
}

@misc{pan09corpus,
  author       = {{Webis at Bauhaus-Universit{\"a}t Weimar} and {NLEL at Universidad Polit{\'e}cnica de Valencia}},
  title        = {{PAN Plagiarism Corpus 2009 (PAN-PC-09)}},
  howpublished = {\url{http://www.webis.de/research/corpora}},
  year         = {2009},
  note         = {Martin Potthast, Andreas Eiselt, Benno Stein, Alberto Barr{\'o}n-Cede{\~n}o, and Paolo Rosso (editors)},
}

@inproceedings{ngram,
  author    = {William B. Cavnar and John M. Trenkle},
  title     = {N-Gram-Based Text Categorization},
  booktitle = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document Analysis and Information Retrieval},
  year      = {1994},
  pages     = {161--175},
}

@inproceedings{intrinsic,
  author    = {{Meyer zu Eissen}, Sven and Benno Stein},
  title     = {Intrinsic Plagiarism Detection},
  booktitle = {ECIR},
  editor    = {Mounia Lalmas and Andy MacFarlane and Stefan M. R{\"u}ger and Anastasios Tombros and Theodora Tsikrika and Alexei Yavlinsky},
  series    = {Lecture Notes in Computer Science},
  volume    = {3936},
  pages     = {565--569},
  publisher = {Springer},
  year      = {2006},
  date      = {2006-04-03},
  isbn      = {3-540-33347-9},
  doi       = {10.1007/11735106_66},
  ee        = {http://dx.doi.org/10.1007/11735106_66},
  url       = {http://dblp.uni-trier.de/db/conf/ecir/ecir2006.html#EissenS06},
}

@inproceedings{Heintze96scalabledocument,
  author    = {Nevin Heintze},
  title     = {Scalable Document Fingerprinting},
  booktitle = {Proc. USENIX Workshop on Electronic Commerce},
  year      = {1996},
}

@inproceedings{suffixtree,
  author    = {Manber, Udi and Myers, Gene},
  title     = {Suffix arrays: a new method for on-line string searches},
  booktitle = {SODA '90: Proceedings of the first annual ACM-SIAM symposium on Discrete algorithms},
  year      = {1990},
  isbn      = {0-89871-251-3},
  pages     = {319--327},
  location  = {San Francisco, California, United States},
  publisher = {Society for Industrial and Applied Mathematics},
  address   = {Philadelphia, PA, USA},
}

@techreport{rsync,
  key         = {Rsync},
  author      = {Andrew Tridgell and Paul Mackerras},
  title       = {The rsync algorithm},
  date        = {2004-05-19},
  year        = {2004},
  institution = {Department of Computer Science, FEIT, Australian National University},
  note        = {\url{http://hdl.handle.net/1885/40765}},
}

@misc{git,
  key          = {Git},
  author       = {Torvalds, Linus and others},
  title        = {{Git---the Fast Version Control System}},
  howpublished = {\url{http://git-scm.com/}, retrieved 2011--01--12},
}

@misc{rpm,
  key          = {RPM},
  title        = {{The RPM Package Manager}},
  howpublished = {\url{http://www.rpm.org/}, retrieved 2011--01--12},
}

@misc{deltarpm,
  key          = {DeltaRPM},
  title        = {{DeltaRPM}},
  howpublished = {\url{ftp://ftp.suse.com/pub/projects/deltarpm/}, retrieved 2011--01--12},
}

@mastersthesis{zazrivec,
  author = {Milan Z{\'a}zrivec},
  title  = {Algoritmus a implementace software pro tvorbu binárních záplat},
  school = {Faculty of Informatics, Masaryk University},
  note   = {\url{http://is.muni.cz/th/60716/fi_m/}},
  year   = {2009},
}

@misc{scholar,
  key          = {Google Scholar},
  title        = {{Google Scholar}},
  howpublished = {\url{http://scholar.google.com/}, retrieved 2011--01--12},
}

@misc{ascii,
  key         = {ASCII},
  title       = {{American National Standard for Information Systems -- Coded Character Sets -- 7-Bit American National Standard Code for Information Interchange (7-Bit ASCII), ANSI X3.4-1986}},
  date        = {1986-03-26},
  year        = {1986},
  institution = {American National Standards Institute, Inc.},
}

@book{unicode,
  author    = {{The Unicode Consortium}},
  title     = {{The Unicode Standard -- Version 4.0}},
  publisher = {Addison-Wesley, Boston, MA},
  year      = {2003},
  isbn      = {0-321-18578-1},
  note      = {\url{http://www.unicode.org/unicode/standard/versions/enumeratedversions.html#Unicode_4_0_0}},
}

@misc{rfc3629,
  key    = {RFC 3629},
  title  = {{RFC 3629: UTF-8, a transformation format of ISO 10646}},
  author = {F. Yergeau},
  year   = {2003},
  note   = {\url{http://tools.ietf.org/html/rfc3629}},
}

@misc{thesisproposal,
  title       = {{Systems for Discovering Similar Documents}},
  author      = {Jan Kasprzak},
  institution = {Faculty of Informatics, Masaryk University},
  year        = {2010},
  note        = {Ph.D. thesis proposal, \url{http://is.muni.cz/th/1885/fi_r/}},
}

@inproceedings{Kasprzak2010,
  author       = {Kasprzak, Jan and Brandejs, Michal},
  title        = {Improving the reliability of the plagiarism detection system},
  booktitle    = {Notebook Papers of CLEF 2010 LABs and Workshops},
  year         = {2010},
  organization = {Citeseer},
}

@article{stamatatos2011plagiarism,
  author    = {Stamatatos, Efstathios},
  title     = {Plagiarism detection using stopword n-grams},
  journal   = {Journal of the American Society for Information Science and Technology},
  year      = {2011},
  publisher = {Wiley Online Library},
}

@inproceedings{pan09stamatatos,
  author    = {Efstathios Stamatatos},
  title     = {Intrinsic Plagiarism Detection Using Character n-gram Profiles},
  booktitle = {Proceedings of the SEPLN'09 Workshop on Uncovering Plagiarism, Authorship and Social Software Misuse},
  pages     = {38--46},
  year      = {2009},
  location  = {San Sebastian (Donostia), Spain},
  issn      = {1613-0073},
}

% A monograph with a publisher and no journal, hence the book type.
@book{zipf1935psycho,
  author    = {Zipf, George Kingsley},
  title     = {The Psycho-Biology of Language},
  year      = {1935},
  publisher = {Houghton, Mifflin},
}

@inproceedings{potthastframework,
  author    = {Martin Potthast and Benno Stein and Alberto Barr{\'o}n-Cede{\~n}o and Paolo Rosso},
  title     = {{An Evaluation Framework for Plagiarism Detection}},
  booktitle = {Proceedings of the 23rd International Conference on Computational Linguistics (COLING 2010)},
  month     = aug,
  year      = {2010},
  address   = {Beijing, China},
  publisher = {Association for Computational Linguistics},
}