Edit xml declaration encoding with java -
i editing xml-file original encoding ascii in declaration. in resulting file want encoding utf-8 in order write swedish characters åäö, can't @ moment.
an example file equivalent file can found @ archivematica wiki.
the resulting sip.xml after running program copy of above example file can reached @ this link. added tag åäö text in end of document.
as seen in code below have tried setting encoding on transformer, , tried use outputstreamwriter set encoding. in end edited declaration in original file utf-8 , åäö written out. problem seems encoding of original file. if i'm not mistaken shouldn't cause problem change declaration ascii utf-8, question is, how do within program? can after parsing document object, or need before parsing?
package provklasser; import java.io.file; import java.io.ioexception; import java.util.logging.level; import java.util.logging.logger; import javax.swing.joptionpane; import javax.xml.parsers.documentbuilder; import javax.xml.parsers.documentbuilderfactory; import javax.xml.parsers.parserconfigurationexception; import javax.xml.transform.outputkeys; import javax.xml.transform.transformer; import javax.xml.transform.transformerconfigurationexception; import javax.xml.transform.transformerexception; import javax.xml.transform.transformerfactory; import javax.xml.transform.dom.domsource; import javax.xml.transform.stream.streamresult; import org.w3c.dom.document; import org.w3c.dom.element; import org.xml.sax.saxexception; /** * * @author */ public class provklass { /** * @param args command line arguments */ public static void main(string[] args) { try { file chosenfile = new file("myfile.xml"); //parsing xml file documentbuilderfactory factory = documentbuilderfactory.newinstance(); factory.setnamespaceaware(true); documentbuilder builder = factory.newdocumentbuilder(); document metsdoc = builder.parse(chosenfile.getabsolutepath()); element agent = (element) metsdoc.getdocumentelement().appendchild(metsdoc.createelementns("http://www.loc.gov/mets/","mets:agent")); agent.appendchild(metsdoc.createtextnode("åäö")); domsource source = new domsource(metsdoc); // write content xml file file newfile = new file(chosenfile.getparent(), "sip.xml"); transformerfactory transformerfactory = transformerfactory.newinstance(); transformer transformer = transformerfactory.newtransformer(); transformer.setoutputproperty(outputkeys.encoding, "utf-8"); streamresult result = new streamresult(newfile); //writer out = new outputstreamwriter(new fileoutputstream("sip.xml"), "utf-8"); //streamresult result = new streamresult(out); transformer.transform(source, result); } catch (parserconfigurationexception ex) { logger.getlogger(provklass.class.getname()).log(level.severe, null, ex); } catch (saxexception ex) { logger.getlogger(provklass.class.getname()).log(level.severe, null, ex); } catch (ioexception ex) { logger.getlogger(provklass.class.getname()).log(level.severe, null, ex); } catch (transformerconfigurationexception ex) { logger.getlogger(provklass.class.getname()).log(level.severe, null, ex); } catch (transformerexception ex) { logger.getlogger(provklass.class.getname()).log(level.severe, null, ex); } } }
update: using metsdoc.getinputencoding() returns utf-8, while metsdoc.getxmlencoding() returns ascii. if parse new file after saving , make new document same result. document seems have right encoding, xml declaration not right.
now edit xml text file before parsing it, replacing parsing part above parsexml(chosenfile.getabsoutepath());
, using following methods:
private string withediteddeclaration(string filename) { stringbuilder text = new stringbuilder(); try { string nl = system.getproperty("line.separator"); try (scanner scanner = new scanner(new fileinputstream(filename))) { string line = scanner.nextline(); text.append(line.replacefirst("ascii", "utf-8") + nl); while (scanner.hasnextline()) { text.append(scanner.nextline() + nl); } } } catch (filenotfoundexception ex) { logger.getlogger(metsadaption.class.getname()).log(level.severe, null, ex); } return text.tostring(); } private void parsexml(string filename) throws saxexception, ioexception, parserconfigurationexception { string xmlstring = withediteddeclaration(filename); //parsing xml file documentbuilderfactory factory = documentbuilderfactory.newinstance(); factory.setnamespaceaware(true); documentbuilder builder = factory.newdocumentbuilder(); inputsource = new inputsource(); is.setcharacterstream(new stringreader(xmlstring)); metsdoc = builder.parse(is); }
it works, seems ugly solution. i'd grateful if knew better way.
i had similar issue xml declaration originally:
<?xml version="1.0" encoding="windows-1252"?>
but after parsing document
, xml
utf-8
encoding stayed windows-1252
though bytes utf-8
. worked out implementation of transformerfactory
com.sun.org.apache.xalan.internal.xsltc.trax.transformerfactoryimpl
changing to:
org.apache.xalan.processor.transformerfactoryimpl
from apache xalan java 2.7.1 resulted in charset in xml deceleration being correctly set , have:
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
Comments
Post a Comment