{"id":"https://openalex.org/W2950686248","doi":"https://doi.org/10.1093/bioinformatics/bty733","title":"SpaRC: scalable sequence clustering using Apache Spark","display_name":"SpaRC: scalable sequence clustering using Apache Spark","publication_year":2018,"publication_date":"2018-08-22","ids":{"openalex":"https://openalex.org/W2950686248","doi":"https://doi.org/10.1093/bioinformatics/bty733","mag":"2950686248","pmid":"https://pubmed.ncbi.nlm.nih.gov/30816928"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1093/bioinformatics/bty733","pdf_url":null,"source":{"id":"https://openalex.org/S52395412","display_name":"Bioinformatics","issn_l":"1367-4803","issn":["1367-4803","1367-4811"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311647","https://openalex.org/P4310311648"],"host_organization_lineage_names":["University of Oxford","Oxford University Press"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"journal-article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://www.osti.gov/biblio/1542383","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5051631233","display_name":"Lizhen Shi","orcid":"https://orcid.org/0000-0002-8776-4470"},"institutions":[{"id":"https://openalex.org/I103163165","display_name":"Florida State University","ror":"https://ror.org/05g3dte14","country_code":"US","type":"education","lineage":["https://openalex.org/I103163165"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lizhen Shi","raw_affiliation_strings":["Department of Computer Science, School of Computer Science, Florida State University, Tallahassee, FL, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, School of Computer Science, Florida State University, Tallahassee, FL, USA","institution_ids":["https://openalex.org/I103163165"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077701341","display_name":"Xiandong Meng","orcid":"https://orcid.org/0000-0002-3382-5274"},"institutions":[{"id":"https://openalex.org/I196679689","display_name":"Joint Genome Institute","ror":"https://ror.org/04xm1d337","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I196679689","https://openalex.org/I39565521"]},{"id":"https://openalex.org/I148283060","display_name":"Lawrence Berkeley National Laboratory","ror":"https://ror.org/02jbv0t02","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiandong Meng","raw_affiliation_strings":["Environmental Genomics and Systems Biology Division, Lawrence Berkeley National Laboratory, Berkeley, CA, USA","US Department of Energy, Joint Genome Institute, Walnut Creek, CA, USA"],"affiliations":[{"raw_affiliation_string":"US Department of Energy, Joint Genome Institute, Walnut Creek, CA, USA","institution_ids":["https://openalex.org/I196679689"]},{"raw_affiliation_string":"Environmental Genomics and Systems Biology Division, Lawrence Berkeley National Laboratory, Berkeley, CA, USA","institution_ids":["https://openalex.org/I148283060"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055161576","display_name":"Elizabeth Tseng","orcid":"https://orcid.org/0000-0002-1074-5095"},"institutions":[{"id":"https://openalex.org/I68554272","display_name":"Pacific Biosciences (United States)","ror":"https://ror.org/00fcszb13","country_code":"US","type":"company","lineage":["https://openalex.org/I68554272"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Elizabeth Tseng","raw_affiliation_strings":["Pacific Biosciences Inc, Menlo Park, CA, USA"],"affiliations":[{"raw_affiliation_string":"Pacific Biosciences Inc, Menlo Park, CA, USA","institution_ids":["https://openalex.org/I68554272"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071387306","display_name":"Michael Mascagni","orcid":"https://orcid.org/0000-0003-3058-4580"},"institutions":[{"id":"https://openalex.org/I103163165","display_name":"Florida State University","ror":"https://ror.org/05g3dte14","country_code":"US","type":"education","lineage":["https://openalex.org/I103163165"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Mascagni","raw_affiliation_strings":["Department of Computer Science, School of Computer Science, Florida State University, Tallahassee, FL, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, School of Computer Science, Florida State University, Tallahassee, FL, USA","institution_ids":["https://openalex.org/I103163165"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100781872","display_name":"Zhong Wang","orcid":"https://orcid.org/0000-0002-6307-0458"},"institutions":[{"id":"https://openalex.org/I196679689","display_name":"Joint Genome Institute","ror":"https://ror.org/04xm1d337","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I196679689","https://openalex.org/I39565521"]},{"id":"https://openalex.org/I148283060","display_name":"Lawrence Berkeley National Laboratory","ror":"https://ror.org/02jbv0t02","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521"]},{"id":"https://openalex.org/I156087764","display_name":"University of California, Merced","ror":"https://ror.org/00d9ah105","country_code":"US","type":"education","lineage":["https://openalex.org/I156087764"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zhong Wang","raw_affiliation_strings":["Environmental Genomics and Systems Biology Division, Lawrence Berkeley National Laboratory, Berkeley, CA, USA","School of Natural Sciences, University of California at Merced, Merced, CA, USA","US Department of Energy, Joint Genome Institute, Walnut Creek, CA, USA"],"affiliations":[{"raw_affiliation_string":"US Department of Energy, Joint Genome Institute, Walnut Creek, CA, USA","institution_ids":["https://openalex.org/I196679689"]},{"raw_affiliation_string":"Environmental Genomics and Systems Biology Division, Lawrence Berkeley National Laboratory, Berkeley, CA, USA","institution_ids":["https://openalex.org/I148283060"]},{"raw_affiliation_string":"School of Natural Sciences, University of California at Merced, Merced, CA, USA","institution_ids":["https://openalex.org/I156087764"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100781872"],"corresponding_institution_ids":["https://openalex.org/I196679689","https://openalex.org/I148283060","https://openalex.org/I156087764"],"apc_list":{"value":3618,"currency":"USD","value_usd":3618,"provenance":"doaj"},"apc_paid":null,"fwci":2.102,"has_fulltext":true,"fulltext_origin":"ngrams","cited_by_count":27,"citation_normalized_percentile":{"value":0.658768,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":93,"max":94},"biblio":{"volume":"35","issue":"5","first_page":"760","last_page":"768"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9993,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9993,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11791","display_name":"Microbial Community Ecology and Physiology","score":0.9895,"subfield":{"id":"https://openalex.org/subfields/2303","display_name":"Ecology"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10887","display_name":"Bioinformatics and Genomic Networks","score":0.9894,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.82741416},{"id":"https://openalex.org/keywords/sequence-assembly","display_name":"Sequence assembly","score":0.5299215},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.4538771}],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.8546289},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.82741416},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.77165806},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.68676245},{"id":"https://openalex.org/C18949551","wikidata":"https://www.wikidata.org/wiki/Q740578","display_name":"Sequence assembly","level":5,"score":0.5299215},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.51578116},{"id":"https://openalex.org/C15151743","wikidata":"https://www.wikidata.org/wiki/Q903778","display_name":"Metagenomics","level":3,"score":0.5147143},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.5013771},{"id":"https://openalex.org/C101985253","wikidata":"https://www.wikidata.org/wiki/Q1073526","display_name":"Shotgun sequencing","level":4,"score":0.49177173},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.4538771},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.35973266},{"id":"https://openalex.org/C51679486","wikidata":"https://www.wikidata.org/wiki/Q380546","display_name":"DNA sequencing","level":3,"score":0.35425177},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.20828521},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.20677215},{"id":"https://openalex.org/C162317418","wikidata":"https://www.wikidata.org/wiki/Q252857","display_name":"Transcriptome","level":4,"score":0.1981745},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.187466},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.16533026},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.15874472},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.085461825},{"id":"https://openalex.org/C150194340","wikidata":"https://www.wikidata.org/wiki/Q26972","display_name":"Gene expression","level":3,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":"","qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D012984","descriptor_name":"Software","qualifier_ui":"","qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D016000","descriptor_name":"Cluster Analysis","qualifier_ui":"","qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D059014","descriptor_name":"High-Throughput Nucleotide Sequencing","qualifier_ui":"","qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D056186","descriptor_name":"Metagenomics","qualifier_ui":"","qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D017422","descriptor_name":"Sequence Analysis, DNA","qualifier_ui":"","qualifier_name":null,"is_major_topic":false}],"locations_count":6,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1093/bioinformatics/bty733","pdf_url":null,"source":{"id":"https://openalex.org/S52395412","display_name":"Bioinformatics","issn_l":"1367-4803","issn":["1367-4803","1367-4811"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311647","https://openalex.org/P4310311648"],"host_organization_lineage_names":["University of Oxford","Oxford University Press"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"https://www.osti.gov/biblio/1542383","pdf_url":null,"source":{"id":"https://openalex.org/S4306402487","display_name":"OSTI OAI (U.S. Department of Energy Office of Scientific and Technical Information)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I139351228","host_organization_name":"Office of Scientific and Technical Information","host_organization_lineage":["https://openalex.org/I139351228"],"host_organization_lineage_names":["Office of Scientific and Technical Information"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://www.osti.gov/biblio/1471135","pdf_url":null,"source":{"id":"https://openalex.org/S4306402487","display_name":"OSTI OAI (U.S. Department of Energy Office of Scientific and Technical Information)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I139351228","host_organization_name":"Office of Scientific and Technical Information","host_organization_lineage":["https://openalex.org/I139351228"],"host_organization_lineage_names":["Office of Scientific and Technical Information"],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"https://escholarship.org/uc/item/7dn7m5rg","pdf_url":"https://escholarship.org/content/qt7dn7m5rg/qt7dn7m5rg.pdf?t=pqrpyz","source":{"id":"https://openalex.org/S4306400115","display_name":"eScholarship (California Digital Library)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I2801248553","host_organization_name":"California Digital Library","host_organization_lineage":["https://openalex.org/I2801248553"],"host_organization_lineage_names":["California Digital Library"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"https://doi.org/10.1101/246496","pdf_url":"https://www.biorxiv.org/content/biorxiv/early/2018/01/11/246496.full.pdf","source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/30816928","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":["National Institutes of Health"],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://www.osti.gov/biblio/1542383","pdf_url":null,"source":{"id":"https://openalex.org/S4306402487","display_name":"OSTI OAI (U.S. Department of Energy Office of Scientific and Technical Information)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I139351228","host_organization_name":"Office of Scientific and Technical Information","host_organization_lineage":["https://openalex.org/I139351228"],"host_organization_lineage_names":["Office of Scientific and Technical Information"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.4}],"grants":[{"funder":"https://openalex.org/F4320337509","funder_display_name":"Biological and Environmental Research","award_id":"DE-AC02-05CH11231"}],"datasets":[],"versions":[],"referenced_works_count":28,"referenced_works":["https://openalex.org/W1969346416","https://openalex.org/W1971899779","https://openalex.org/W1972924519","https://openalex.org/W2057253402","https://openalex.org/W2096128575","https://openalex.org/W2107974372","https://openalex.org/W2112071632","https://openalex.org/W2113679889","https://openalex.org/W2124110196","https://openalex.org/W2125266506","https://openalex.org/W2127175247","https://openalex.org/W2129504003","https://openalex.org/W2130669817","https://openalex.org/W2132202037","https://openalex.org/W2150208009","https://openalex.org/W2159591897","https://openalex.org/W2194172909","https://openalex.org/W2212407137","https://openalex.org/W2399830453","https://openalex.org/W2523288541","https://openalex.org/W2525711135","https://openalex.org/W2527795289","https://openalex.org/W2531396638","https://openalex.org/W2599417231","https://openalex.org/W2602978558","https://openalex.org/W2731251882","https://openalex.org/W2773939681","https://openalex.org/W958219903"],"related_works":["https://openalex.org/W4389452281","https://openalex.org/W4238948313","https://openalex.org/W3183189600","https://openalex.org/W3134948850","https://openalex.org/W3111832393","https://openalex.org/W3043668926","https://openalex.org/W3036436206","https://openalex.org/W3000916974","https://openalex.org/W2987572536","https://openalex.org/W2573498001"],"abstract_inverted_index":{"Whole":[0],"genome":[1],"shotgun":[2],"based":[3,78],"next-generation":[4,150],"transcriptomics":[5],"and":[6,35,46,55,96,101,114,126,153],"metagenomics":[7],"studies":[8],"often":[9,43],"generate":[10],"100-1000":[11],"GB":[12],"sequence":[13,70,167],"data":[14,29,112,168],"derived":[15],"from":[16,98,149],"tens":[17],"of":[18,20,27,82,116,147],"thousands":[19],"different":[21],"genes":[22,61],"or":[23,62],"microbial":[24],"species.":[25],"Assembly":[26],"these":[28],"sets":[30],"requires":[31],"tradeoffs":[32],"between":[33],"scalability":[34,42,109],"accuracy.":[36],"Current":[37],"assembly":[38,87],"methods":[39],"optimized":[40],"for":[41,59,144,164],"sacrifice":[44],"accuracy":[45,58],"vice":[47],"versa.":[48],"An":[49],"ideal":[50],"solution":[51,143,159],"would":[52],"both":[53,99,123],"scale":[54],"produce":[56],"optimal":[57],"individual":[60],"genomes.Here":[63],"we":[64],"describe":[65],"an":[66],"Apache":[67,154],"Spark-based":[68],"scalable":[69,142],"clustering":[71,92,145],"application,":[72],"SparkReadClust":[73],"(SpaRC),":[74],"that":[75,138],"partitions":[76],"reads":[77,148],"on":[79,94,122],"their":[80],"molecule":[81],"origin":[83],"to":[84],"enable":[85],"downstream":[86],"optimization.":[88],"SpaRC":[89,119,139],"produces":[90],"high":[91],"performance":[93],"transcriptomes":[95],"metagenomes":[97],"short":[100],"long":[102],"read":[103],"sequencing":[104,151],"technologies.":[105],"It":[106],"achieves":[107],"near-linear":[108],"with":[110,160],"input":[111],"size":[113],"number":[115],"compute":[117],"nodes.":[118],"can":[120],"run":[121],"cloud":[124],"computing":[125],"HPC":[127],"environments":[128],"without":[129],"modification":[130],"while":[131],"delivering":[132],"similar":[133,165],"performance.":[134],"Our":[135],"results":[136],"demonstrate":[137],"provides":[140],"a":[141,157],"billions":[146],"experiments,":[152],"Spark":[155],"represents":[156],"cost-effective":[158],"rapid":[161],"development/deployment":[162],"cycles":[163],"large-scale":[166],"analysis":[169],"problems.https://bitbucket.org/berkeleylab/jgi-sparc.":[170]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W2950686248","counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":6},{"year":2020,"cited_by_count":12},{"year":2019,"cited_by_count":6}],"updated_date":"2025-01-07T18:58:57.354021","created_date":"2019-06-27"}